diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 561019bb6554..0bb062e218a6 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -17840,26 +17840,39 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { auto Op = RMW->getOperation(); switch (Op) { - case AtomicRMWInst::Xchg: { + case AtomicRMWInst::Xchg: // PCIe supports add and xchg for system atomics. return isAtomicRMWLegalXChgTy(RMW) ? TargetLowering::AtomicExpansionKind::None : TargetLowering::AtomicExpansionKind::CmpXChg; - } case AtomicRMWInst::Add: - case AtomicRMWInst::And: - case AtomicRMWInst::UIncWrap: - case AtomicRMWInst::UDecWrap: + // PCIe supports add and xchg for system atomics. return atomicSupportedIfLegalIntType(RMW); case AtomicRMWInst::Sub: + case AtomicRMWInst::And: case AtomicRMWInst::Or: - case AtomicRMWInst::Xor: { - // Atomic sub/or/xor do not work over PCI express, but atomic add - // does. InstCombine transforms these with 0 to or, so undo that. - if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) { - if (Constant *ConstVal = dyn_cast(RMW->getValOperand()); - ConstVal && ConstVal->isNullValue()) - return AtomicExpansionKind::Expand; + case AtomicRMWInst::Xor: + case AtomicRMWInst::Max: + case AtomicRMWInst::Min: + case AtomicRMWInst::UMax: + case AtomicRMWInst::UMin: + case AtomicRMWInst::UIncWrap: + case AtomicRMWInst::UDecWrap: { + if (AMDGPU::isFlatGlobalAddrSpace(AS) || + AS == AMDGPUAS::BUFFER_FAT_POINTER) { + // Always expand system scope atomics. + if (HasSystemScope && !Subtarget->hasEmulatedSystemScopeAtomics()) { + if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or || + Op == AtomicRMWInst::Xor) { + // Atomic sub/or/xor do not work over PCI express, but atomic add + // does. InstCombine transforms these with 0 to or, so undo that. + if (Constant *ConstVal = dyn_cast(RMW->getValOperand()); + ConstVal && ConstVal->isNullValue()) + return AtomicExpansionKind::Expand; + } + + return AtomicExpansionKind::CmpXChg; + } } return atomicSupportedIfLegalIntType(RMW); @@ -18014,18 +18027,6 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { return AtomicExpansionKind::CmpXChg; } - case AtomicRMWInst::Min: - case AtomicRMWInst::Max: - case AtomicRMWInst::UMin: - case AtomicRMWInst::UMax: { - if (AMDGPU::isFlatGlobalAddrSpace(AS) || - AS == AMDGPUAS::BUFFER_FAT_POINTER) { - if (HasSystemScope && !Subtarget->hasEmulatedSystemScopeAtomics()) - return AtomicExpansionKind::CmpXChg; - } - - return atomicSupportedIfLegalIntType(RMW); - } case AtomicRMWInst::Nand: case AtomicRMWInst::FSub: default: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll index f2035c278713..98d498002d6b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll @@ -462,19 +462,36 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_not_b32_e32 v2, 41 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s2, s2, 16 -; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; CI-NEXT: s_load_dword s6, s[2:3], 0x4 +; CI-NEXT: s_add_u32 s4, s2, 16 +; CI-NEXT: s_addc_u32 s5, s3, 0 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: s_mov_b64 s[2:3], 0 +; CI-NEXT: v_mov_b32_e32 v1, s5 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v3, s6 +; CI-NEXT: .LBB6_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v4, v3 +; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v4 +; CI-NEXT: v_add_i32_e32 v5, vcc, 0xffffffd5, v4 +; CI-NEXT: v_cmp_lt_u32_e32 vcc, v5, v2 +; CI-NEXT: v_cndmask_b32_e64 v3, v3, 42, vcc +; CI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; CI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; CI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; CI-NEXT: s_cbranch_execnz .LBB6_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end +; CI-NEXT: s_or_b64 exec, exec, s[2:3] ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_dec_ret_i32_offset_system: @@ -482,56 +499,128 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_not_b32_e32 v2, 41 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s2, s2, 16 -; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; VI-NEXT: s_load_dword s6, s[2:3], 0x10 +; VI-NEXT: s_add_u32 s4, s2, 16 +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_mov_b64 s[2:3], 0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: .LBB6_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0xffffffd5, v4 +; VI-NEXT: v_cmp_lt_u32_e32 vcc, v5, v2 +; VI-NEXT: v_cndmask_b32_e64 v3, v3, 42, vcc +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; VI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; VI-NEXT: s_cbranch_execnz .LBB6_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_dec_ret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_not_b32_e32 v0, 41 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x10 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-NEXT: v_add_u32_e32 v4, 0xffffffd5, v3 +; GFX9-NEXT: v_add_u32_e32 v2, -1, v3 +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v4, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 42, vcc +; GFX9-NEXT: global_atomic_cmpswap v2, v1, v[2:3], s[2:3] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB6_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_dec_ret_i32_offset_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x10 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0xffffffd5, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, -1, v2 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0xffffffd6, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, 42, vcc_lo +; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[2:3] offset:16 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB6_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_dec_ret_i32_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] offset:16 glc +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xffffffd5, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, -1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0xffffffd6, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v3, 42, vcc_lo +; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[2:3] offset:16 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4 @@ -686,16 +775,32 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CI-NEXT: v_not_b32_e32 v4, 41 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s0, s0, 16 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_atomic_dec v[0:1], v2 +; CI-NEXT: s_load_dword s4, s[0:1], 0x4 +; CI-NEXT: s_add_u32 s2, s0, 16 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_mov_b64 s[0:1], 0 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v3, s4 +; CI-NEXT: .LBB9_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v3 +; CI-NEXT: v_add_i32_e32 v5, vcc, 0xffffffd5, v3 +; CI-NEXT: v_cmp_lt_u32_e32 vcc, v5, v4 +; CI-NEXT: v_cndmask_b32_e64 v2, v2, 42, vcc +; CI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v3, v2 +; CI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; CI-NEXT: s_cbranch_execnz .LBB9_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_dec_noret_i32_offset_system: @@ -703,50 +808,115 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; VI-NEXT: v_not_b32_e32 v4, 41 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: s_load_dword s4, s[0:1], 0x10 +; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: .LBB9_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0xffffffd5, v3 +; VI-NEXT: v_cmp_lt_u32_e32 vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_e64 v2, v2, 42, vcc +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_cbranch_execnz .LBB9_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_dec_noret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_not_b32_e32 v2, 41 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v1, v0, s[0:1] offset:16 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_u32_e32 v4, 0xffffffd5, v1 +; GFX9-NEXT: v_add_u32_e32 v0, -1, v1 +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 42, vcc +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB9_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_dec_noret_i32_offset_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_atomic_dec v1, v0, s[0:1] offset:16 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x10 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0xffffffd5, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v3, -1, v1 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0xffffffd6, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, 42, vcc_lo +; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_dec_noret_i32_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1] offset:16 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0xffffffd5, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v3, -1, v1 +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0xffffffd6, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v3, 42, vcc_lo +; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4 @@ -1132,18 +1302,34 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_not_b32_e32 v2, 41 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; CI-NEXT: flat_load_dword v3, v[0:1] +; CI-NEXT: s_mov_b64 s[2:3], 0 +; CI-NEXT: .LBB14_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v4, v3 +; CI-NEXT: v_add_i32_e32 v3, vcc, -1, v4 +; CI-NEXT: v_add_i32_e32 v5, vcc, 0xffffffd5, v4 +; CI-NEXT: v_cmp_lt_u32_e32 vcc, v5, v2 +; CI-NEXT: v_cndmask_b32_e64 v3, v3, 42, vcc +; CI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; CI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; CI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; CI-NEXT: s_cbranch_execnz .LBB14_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end +; CI-NEXT: s_or_b64 exec, exec, s[2:3] ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_ret_i32_offset_system: @@ -1152,18 +1338,34 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_not_b32_e32 v2, 41 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[2:3], 0 +; VI-NEXT: .LBB14_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v4 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0xffffffd5, v4 +; VI-NEXT: v_cmp_lt_u32_e32 vcc, v5, v2 +; VI-NEXT: v_cndmask_b32_e64 v3, v3, 42, vcc +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; VI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; VI-NEXT: s_cbranch_execnz .LBB14_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_system: @@ -1171,16 +1373,32 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_not_b32_e32 v2, 41 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc +; GFX9-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_add_u32_e32 v5, 0xffffffd5, v4 +; GFX9-NEXT: v_add_u32_e32 v3, -1, v4 +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v5, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 42, vcc +; GFX9-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB14_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: flat_store_dword v[0:1], v3 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_system: @@ -1190,16 +1408,31 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, 16 ; GFX10-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: flat_atomic_dec v2, v[0:1], v2 glc +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: flat_load_dword v2, v[0:1] +; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v2, 0xffffffd5, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v4, -1, v3 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0xffffffd6, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, 42, vcc_lo +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_execnz .LBB14_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_store_dword v[0:1], v2 @@ -1208,13 +1441,31 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr % ; GFX11-LABEL: flat_atomic_dec_ret_i32_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: flat_load_b32 v2, v[0:1] offset:16 +; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0xffffffd5, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, -1, v3 +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0xffffffd6, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, 42, vcc_lo +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_execnz .LBB14_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm @@ -1396,15 +1647,30 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_not_b32_e32 v4, 41 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_atomic_dec v[0:1], v2 +; CI-NEXT: flat_load_dword v3, v[0:1] +; CI-NEXT: s_mov_b64 s[0:1], 0 +; CI-NEXT: .LBB17_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: v_add_i32_e32 v2, vcc, -1, v3 +; CI-NEXT: v_add_i32_e32 v5, vcc, 0xffffffd5, v3 +; CI-NEXT: v_cmp_lt_u32_e32 vcc, v5, v4 +; CI-NEXT: v_cndmask_b32_e64 v2, v2, 42, vcc +; CI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v3, v2 +; CI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; CI-NEXT: s_cbranch_execnz .LBB17_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_noret_i32_offset_system: @@ -1413,15 +1679,30 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_not_b32_e32 v4, 41 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: .LBB17_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v3 +; VI-NEXT: v_add_u32_e32 v5, vcc, 0xffffffd5, v3 +; VI-NEXT: v_cmp_lt_u32_e32 vcc, v5, v4 +; VI-NEXT: v_cndmask_b32_e64 v2, v2, 42, vcc +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_cbranch_execnz .LBB17_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_system: @@ -1429,13 +1710,28 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_not_b32_e32 v4, 41 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_atomic_dec v[0:1], v2 offset:16 +; GFX9-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v5, 0xffffffd5, v3 +; GFX9-NEXT: v_add_u32_e32 v2, -1, v3 +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v5, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 42, vcc +; GFX9-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB17_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_system: @@ -1445,30 +1741,58 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 16 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_atomic_dec v[0:1], v2 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v2, 0xffffffd5, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v4, -1, v3 +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0xffffffd6, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, 42, vcc_lo +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_cbranch_execnz .LBB17_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_dec_noret_i32_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:16 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:16 +; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0xffffffd5, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v4, -1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0xffffffd6, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, 42, vcc_lo +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB17_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %result = atomicrmw udec_wrap ptr %gep, i32 42 seq_cst, align 4 @@ -2113,17 +2437,41 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: v_not_b32_e32 v6, 41 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s0, s0, 32 +; CI-NEXT: s_add_u32 s2, s0, 32 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v5, s3 +; CI-NEXT: s_add_u32 s0, s0, 36 +; CI-NEXT: v_mov_b32_e32 v4, s2 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_load_dword v2, v[4:5] +; CI-NEXT: flat_load_dword v3, v[0:1] +; CI-NEXT: v_mov_b32_e32 v7, -1 +; CI-NEXT: s_mov_b64 s[0:1], 0 +; CI-NEXT: .LBB24_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: v_add_i32_e32 v8, vcc, -1, v2 +; CI-NEXT: v_addc_u32_e32 v9, vcc, -1, v3, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 0xffffffd5, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; CI-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[6:7] +; CI-NEXT: v_cndmask_b32_e64 v0, v8, 42, vcc +; CI-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc +; CI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; CI-NEXT: v_mov_b32_e32 v3, v1 +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v2, v0 +; CI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; CI-NEXT: s_cbranch_execnz .LBB24_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_noret_i64_offset_system: @@ -2131,32 +2479,75 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: v_not_b32_e32 v6, 41 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: s_add_u32 s2, s0, 32 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: s_add_u32 s0, s0, 36 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_load_dword v2, v[4:5] +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v7, -1 +; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: .LBB24_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v8, vcc, -1, v2 +; VI-NEXT: v_addc_u32_e32 v9, vcc, -1, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 0xffffffd5, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[6:7] +; VI-NEXT: v_cndmask_b32_e64 v0, v8, 42, vcc +; VI-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_cbranch_execnz .LBB24_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_not_b32_e32 v6, 41 +; GFX9-NEXT: v_mov_b32_e32 v7, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, -1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffffd5, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, 42, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB24_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_system: @@ -2166,32 +2557,72 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_not_b32_e32 v6, 41 +; GFX10-NEXT: v_mov_b32_e32 v7, -1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 32 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v5, s1 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffffd5, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v3, vcc_lo +; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v2, -1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, -1, v3, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, 42, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc_lo +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_cbranch_execnz .LBB24_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_dec_noret_i64_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_not_b32_e32 v6, 41 +; GFX11-NEXT: v_mov_b32_e32 v7, -1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: flat_load_b64 v[2:3], v[4:5] offset:32 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffffd5, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v2, -1 +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[6:7] +; GFX11-NEXT: v_cndmask_b32_e64 v0, v8, 42, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc_lo +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB24_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %gep = getelementptr inbounds i64, ptr %ptr, i32 4 %result = atomicrmw udec_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0 @@ -3025,20 +3456,42 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: v_not_b32_e32 v0, 41 +; CI-NEXT: v_mov_b32_e32 v1, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s2, s2, 32 -; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x8 +; CI-NEXT: s_add_u32 s4, s2, 32 +; CI-NEXT: s_addc_u32 s5, s3, 0 +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: s_mov_b64 s[2:3], 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v4, s6 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: v_mov_b32_e32 v5, s7 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: .LBB34_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v7, v5 +; CI-NEXT: v_mov_b32_e32 v6, v4 +; CI-NEXT: v_add_i32_e32 v8, vcc, -1, v6 +; CI-NEXT: v_addc_u32_e32 v9, vcc, -1, v7, vcc +; CI-NEXT: v_add_i32_e32 v4, vcc, 0xffffffd5, v6 +; CI-NEXT: v_addc_u32_e32 v5, vcc, -1, v7, vcc +; CI-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1] +; CI-NEXT: v_cndmask_b32_e64 v4, v8, 42, vcc +; CI-NEXT: v_cndmask_b32_e64 v5, v9, 0, vcc +; CI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[2:3], v[4:7] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; CI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; CI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; CI-NEXT: s_cbranch_execnz .LBB34_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end +; CI-NEXT: s_or_b64 exec, exec, s[2:3] +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_dec_ret_i64_offset_system: @@ -3046,60 +3499,155 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: v_not_b32_e32 v0, 41 +; VI-NEXT: v_mov_b32_e32 v1, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s2, s2, 32 -; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x20 +; VI-NEXT: s_add_u32 s4, s2, 32 +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_mov_b64 s[2:3], 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: .LBB34_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_add_u32_e32 v8, vcc, -1, v6 +; VI-NEXT: v_addc_u32_e32 v9, vcc, -1, v7, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 0xffffffd5, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, -1, v7, vcc +; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[0:1] +; VI-NEXT: v_cndmask_b32_e64 v4, v8, 42, vcc +; VI-NEXT: v_cndmask_b32_e64 v5, v9, 0, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[2:3], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; VI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; VI-NEXT: s_cbranch_execnz .LBB34_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_dec_ret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_not_b32_e32 v0, 41 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v1, -1 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x20 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v8, v3 +; GFX9-NEXT: v_mov_b32_e32 v7, v2 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, -1, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v8, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffffd5, v7 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v8, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 42, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[5:8], s[2:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[7:8] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB34_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_dec_ret_i64_offset_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_not_b32_e32 v0, 41 +; GFX10-NEXT: v_mov_b32_e32 v1, -1 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v8, v3 +; GFX10-NEXT: v_mov_b32_e32 v7, v2 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffffd5, v7 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v8, vcc_lo +; GFX10-NEXT: v_add_co_u32 v5, vcc_lo, v7, -1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, -1, v8, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, 42, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc_lo +; GFX10-NEXT: global_atomic_cmpswap_x2 v[2:3], v4, v[5:8], s[2:3] offset:32 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[7:8] +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB34_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_dec_ret_i64_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-NEXT: v_not_b32_e32 v0, 41 +; GFX11-NEXT: v_dual_mov_b32 v1, -1 :: v_dual_mov_b32 v4, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x20 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2 +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffffd5, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v8, vcc_lo +; GFX11-NEXT: v_add_co_u32 v5, vcc_lo, v7, -1 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, -1, v8, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[0:1] +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, 42, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc_lo +; GFX11-NEXT: global_atomic_cmpswap_b64 v[2:3], v4, v[5:8], s[2:3] offset:32 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[7:8] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB34_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_store_b64 v0, v[2:3], s[0:1] ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8 @@ -3264,17 +3812,38 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CI-NEXT: v_mov_b32_e32 v0, 42 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: v_not_b32_e32 v4, 41 +; CI-NEXT: v_mov_b32_e32 v5, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s0, s0, 32 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; CI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8 +; CI-NEXT: s_add_u32 s2, s0, 32 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v7, s3 +; CI-NEXT: s_mov_b64 s[0:1], 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v6, s2 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CI-NEXT: .LBB37_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_add_i32_e32 v8, vcc, -1, v2 +; CI-NEXT: v_addc_u32_e32 v9, vcc, -1, v3, vcc +; CI-NEXT: v_add_i32_e32 v0, vcc, 0xffffffd5, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; CI-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] +; CI-NEXT: v_cndmask_b32_e64 v0, v8, 42, vcc +; CI-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc +; CI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; CI-NEXT: v_mov_b32_e32 v3, v1 +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v2, v0 +; CI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; CI-NEXT: s_cbranch_execnz .LBB37_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_dec_noret_i64_offset_system: @@ -3282,54 +3851,141 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; VI-NEXT: v_mov_b32_e32 v0, 42 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: v_not_b32_e32 v4, 41 +; VI-NEXT: v_mov_b32_e32 v5, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 32 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 +; VI-NEXT: s_add_u32 s2, s0, 32 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v7, s3 +; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; VI-NEXT: .LBB37_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_add_u32_e32 v8, vcc, -1, v2 +; VI-NEXT: v_addc_u32_e32 v9, vcc, -1, v3, vcc +; VI-NEXT: v_add_u32_e32 v0, vcc, 0xffffffd5, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] +; VI-NEXT: v_cndmask_b32_e64 v0, v8, 42, vcc +; VI-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:7], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_cbranch_execnz .LBB37_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_dec_noret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_not_b32_e32 v4, 41 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, -1 +; GFX9-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, -1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, -1, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffffd5, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v7, 42, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v8, 0, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB37_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_dec_noret_i64_offset_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_not_b32_e32 v4, 41 +; GFX10-NEXT: v_mov_b32_e32 v5, -1 +; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x20 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffffd5, v2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v3, vcc_lo +; GFX10-NEXT: v_add_co_u32 v7, vcc_lo, v2, -1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, -1, v3, vcc_lo +; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, 42, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, 0, vcc_lo +; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] offset:32 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_execnz .LBB37_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_dec_noret_i64_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX11-NEXT: v_not_b32_e32 v4, 41 +; GFX11-NEXT: v_dual_mov_b32 v5, -1 :: v_dual_mov_b32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x20 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xffffffd5, v2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v3, vcc_lo +; GFX11-NEXT: v_add_co_u32 v7, vcc_lo, v2, -1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, -1, v3, vcc_lo +; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, 42, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v8, 0, vcc_lo +; GFX11-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] offset:32 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_execnz .LBB37_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll index 80c743cac484..0ccfbd845639 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll @@ -530,15 +530,30 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s2, s2, 16 -; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; CI-NEXT: s_load_dword s6, s[2:3], 0x4 +; CI-NEXT: s_add_u32 s4, s2, 16 +; CI-NEXT: s_addc_u32 s5, s3, 0 +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: s_mov_b64 s[2:3], 0 +; CI-NEXT: v_mov_b32_e32 v1, s5 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s6 +; CI-NEXT: .LBB6_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v3, v2 +; CI-NEXT: v_add_i32_e32 v2, vcc, 1, v3 +; CI-NEXT: v_cmp_le_u32_e32 vcc, 42, v3 +; CI-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; CI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; CI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; CI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; CI-NEXT: s_cbranch_execnz .LBB6_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end +; CI-NEXT: s_or_b64 exec, exec, s[2:3] ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -550,15 +565,30 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s2, s2, 16 -; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; VI-NEXT: s_load_dword s6, s[2:3], 0x10 +; VI-NEXT: s_add_u32 s4, s2, 16 +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_mov_b64 s[2:3], 0 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: .LBB6_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v3 +; VI-NEXT: v_cmp_le_u32_e32 vcc, 42, v3 +; VI-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; VI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; VI-NEXT: s_cbranch_execnz .LBB6_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -567,51 +597,123 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace ; GFX9-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc +; GFX9-NEXT: s_load_dword s6, s[2:3], 0x10 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_add_u32_e32 v1, 1, v2 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 42, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX9-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[2:3] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB6_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_atomic_inc v0, v1, v0, s[2:3] offset:16 glc +; GFX10-NEXT: s_load_dword s4, s[2:3], 0x10 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s4 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 1, v2 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo +; GFX10-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[2:3] offset:16 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB6_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 glc +; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mov_b32_e32 v2, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 1, v2 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo +; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[2:3] offset:16 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB6_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_atomic_inc_ret_i32_offset_sistem: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x10 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v2, v1 +; GFX12-NEXT: v_add_nc_u32_e32 v1, 1, v2 +; GFX12-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB6_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4 @@ -787,15 +889,29 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s0, s0, 16 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_atomic_inc v[0:1], v2 +; CI-NEXT: s_load_dword s4, s[0:1], 0x4 +; CI-NEXT: s_add_u32 s2, s0, 16 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: s_mov_b64 s[0:1], 0 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v3, s4 +; CI-NEXT: .LBB9_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_add_i32_e32 v2, vcc, 1, v3 +; CI-NEXT: v_cmp_le_u32_e32 vcc, 42, v3 +; CI-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; CI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v3, v2 +; CI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; CI-NEXT: s_cbranch_execnz .LBB9_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_noret_i32_offset_system: @@ -804,61 +920,137 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 16 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: s_load_dword s4, s[0:1], 0x10 +; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s4 +; VI-NEXT: .LBB9_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v3 +; VI-NEXT: v_cmp_le_u32_e32 vcc, 42, v3 +; VI-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_cbranch_execnz .LBB9_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_noret_i32_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v1, v0, s[0:1] offset:16 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x10 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_u32_e32 v0, 1, v1 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 42, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB9_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_noret_i32_offset_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_atomic_inc v1, v0, s[0:1] offset:16 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_load_dword s2, s[0:1], 0x10 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v1 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX10-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_execnz .LBB9_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_inc_noret_i32_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x10 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v1 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_execnz .LBB9_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_atomic_inc_noret_i32_offset_system: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x10 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: s_mov_b32 s2, 0 +; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_nc_u32_e32 v0, 1, v1 +; GFX12-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v1 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX12-NEXT: v_mov_b32_e32 v1, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_execnz .LBB9_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_endpgm %gep = getelementptr i32, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 42 seq_cst, align 4 @@ -1736,20 +1928,38 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s2, s2, 32 -; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8 +; CI-NEXT: s_add_u32 s6, s2, 32 +; CI-NEXT: s_addc_u32 s7, s3, 0 +; CI-NEXT: v_mov_b32_e32 v0, s6 +; CI-NEXT: s_mov_b64 s[2:3], 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s4 +; CI-NEXT: v_mov_b32_e32 v1, s7 +; CI-NEXT: v_mov_b32_e32 v3, s5 +; CI-NEXT: .LBB19_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_mov_b32_e32 v5, v3 +; CI-NEXT: v_mov_b32_e32 v4, v2 +; CI-NEXT: v_add_i32_e32 v2, vcc, 1, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_cmp_le_u64_e32 vcc, 42, v[4:5] +; CI-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; CI-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; CI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; CI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; CI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; CI-NEXT: s_cbranch_execnz .LBB19_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end +; CI-NEXT: s_or_b64 exec, exec, s[2:3] +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_ret_i64_offset_system: @@ -1757,73 +1967,176 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s2, s2, 32 -; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20 +; VI-NEXT: s_add_u32 s6, s2, 32 +; VI-NEXT: s_addc_u32 s7, s3, 0 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_mov_b64 s[2:3], 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: .LBB19_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_mov_b32_e32 v5, v3 +; VI-NEXT: v_mov_b32_e32 v4, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_cmp_le_u64_e32 vcc, 42, v[4:5] +; VI-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; VI-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; VI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; VI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; VI-NEXT: s_cbranch_execnz .LBB19_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_ret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x20 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, 42, v[5:6] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v0, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[2:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB19_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_ret_i64_offset_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_mov_b32_e32 v6, v1 +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v5, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v6, vcc_lo +; GFX10-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[5:6] +; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc_lo +; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[2:3] offset:32 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[5:6] +; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: s_cbranch_execnz .LBB19_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_inc_ret_i64_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 glc +; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x20 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: s_mov_b32 s4, 0 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v5, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v6, vcc_lo +; GFX11-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[5:6] +; GFX11-NEXT: v_cndmask_b32_e64 v3, v0, 0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc_lo +; GFX11-NEXT: global_atomic_cmpswap_b64 v[0:1], v2, v[3:6], s[2:3] offset:32 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[5:6] +; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: s_cbranch_execnz .LBB19_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_atomic_inc_ret_i64_offset_system: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, 42 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x20 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_mov_b32 s4, 0 +; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v5, 1 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v6, vcc_lo +; GFX12-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[5:6] +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v3, v0, 0, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc_lo ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v2, v[3:6], s[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[5:6] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: s_cbranch_execnz .LBB19_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 @@ -2011,17 +2324,34 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s0, s0, 32 -; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; CI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8 +; CI-NEXT: s_add_u32 s4, s0, 32 +; CI-NEXT: s_addc_u32 s5, s1, 0 +; CI-NEXT: v_mov_b32_e32 v4, s4 +; CI-NEXT: s_mov_b64 s[0:1], 0 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_mov_b32_e32 v5, s5 +; CI-NEXT: v_mov_b32_e32 v3, s3 +; CI-NEXT: .LBB22_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; CI-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; CI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; CI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; CI-NEXT: v_mov_b32_e32 v3, v1 +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v2, v0 +; CI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; CI-NEXT: s_cbranch_execnz .LBB22_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_noret_i64_offset_system: @@ -2029,67 +2359,160 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 32 -; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x20 +; VI-NEXT: s_add_u32 s4, s0, 32 +; VI-NEXT: s_addc_u32 s5, s1, 0 +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: .LBB22_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_cbranch_execnz .LBB22_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_noret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB22_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_atomic_inc_noret_i64_offset_system: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32 -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x20 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo +; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_execnz .LBB22_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: global_atomic_inc_noret_i64_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x20 +; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, 1 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX11-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo +; GFX11-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_execnz .LBB22_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: global_atomic_inc_noret_i64_offset_system: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, 42 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x20 +; GFX12-NEXT: v_mov_b32_e32 v4, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_mov_b32 s2, 0 +; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v2, 1 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX12-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[2:3] +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_execnz .LBB22_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_endpgm %gep = getelementptr i64, ptr addrspace(1) %ptr, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 42 seq_cst, align 8 @@ -2538,15 +2961,29 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s2, s2, 16 ; CI-NEXT: s_addc_u32 s3, s3, 0 ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; CI-NEXT: flat_load_dword v2, v[0:1] +; CI-NEXT: s_mov_b64 s[2:3], 0 +; CI-NEXT: .LBB27_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v3, v2 +; CI-NEXT: v_add_i32_e32 v2, vcc, 1, v3 +; CI-NEXT: v_cmp_le_u32_e32 vcc, 42, v3 +; CI-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; CI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; CI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; CI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; CI-NEXT: s_cbranch_execnz .LBB27_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end +; CI-NEXT: s_or_b64 exec, exec, s[2:3] ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -2558,15 +2995,29 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s2, s2, 16 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: s_mov_b64 s[2:3], 0 +; VI-NEXT: .LBB27_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v3 +; VI-NEXT: v_cmp_le_u32_e32 vcc, 42, v3 +; VI-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; VI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; VI-NEXT: s_cbranch_execnz .LBB27_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 @@ -2577,13 +3028,27 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: flat_atomic_inc v2, v[0:1], v2 offset:16 glc +; GFX9-NEXT: flat_load_dword v2, v[0:1] offset:16 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-NEXT: v_add_u32_e32 v2, 1, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 42, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX9-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB27_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: flat_store_dword v[0:1], v2 @@ -2596,16 +3061,30 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, 16 ; GFX10-NEXT: s_addc_u32 s3, s3, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-NEXT: flat_atomic_inc v2, v[0:1], v2 glc +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: flat_load_dword v2, v[0:1] +; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v3 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_execnz .LBB27_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: flat_store_dword v[0:1], v2 @@ -2614,13 +3093,29 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; GFX11-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 -; GFX11-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 glc +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: flat_load_b32 v2, v[0:1] offset:16 +; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 1, v3 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_execnz .LBB27_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_endpgm @@ -2628,14 +3123,32 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr % ; GFX12-LABEL: flat_atomic_inc_ret_i32_offset_system: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 42 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_mov_b32 s2, 0 +; GFX12-NEXT: flat_load_b32 v2, v[0:1] offset:16 +; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_nc_u32_e32 v2, 1, v3 +; GFX12-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v3 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_execnz .LBB27_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm @@ -2839,15 +3352,28 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_add_u32 s0, s0, 16 ; CI-NEXT: s_addc_u32 s1, s1, 0 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: flat_atomic_inc v[0:1], v2 +; CI-NEXT: flat_load_dword v3, v[0:1] +; CI-NEXT: s_mov_b64 s[0:1], 0 +; CI-NEXT: .LBB30_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: v_add_i32_e32 v2, vcc, 1, v3 +; CI-NEXT: v_cmp_le_u32_e32 vcc, 42, v3 +; CI-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; CI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v3, v2 +; CI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; CI-NEXT: s_cbranch_execnz .LBB30_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_noret_i32_offset_system: @@ -2856,15 +3382,28 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s0, 16 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: .LBB30_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v3 +; VI-NEXT: v_cmp_le_u32_e32 vcc, 42, v3 +; VI-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_cbranch_execnz .LBB30_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_system: @@ -2872,13 +3411,26 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: flat_atomic_inc v[0:1], v2 offset:16 +; GFX9-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v2, 1, v3 +; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 42, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX9-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB30_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_system: @@ -2888,43 +3440,86 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v2, 42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 16 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: flat_atomic_inc v[0:1], v2 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: flat_load_dword v3, v[0:1] +; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v2, 1, v3 +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo +; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_cbranch_execnz .LBB30_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v2, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX11-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:16 +; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v2, 1, v3 +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo +; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB30_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_inc_noret_i32_offset_system: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v2, 42 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:16 +; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_add_nc_u32_e32 v2, 1, v3 +; GFX12-NEXT: v_cmp_le_u32_e32 vcc_lo, 42, v3 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB30_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_endpgm %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i32 42 seq_cst, align 4 @@ -3583,25 +4178,46 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s2, s2, 32 +; CI-NEXT: s_add_u32 s4, s2, 32 +; CI-NEXT: s_addc_u32 s5, s3, 0 +; CI-NEXT: s_add_u32 s2, s2, 36 ; CI-NEXT: s_addc_u32 s3, s3, 0 -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; CI-NEXT: v_mov_b32_e32 v0, s4 +; CI-NEXT: v_mov_b32_e32 v4, s3 +; CI-NEXT: v_mov_b32_e32 v1, s5 +; CI-NEXT: v_mov_b32_e32 v3, s2 +; CI-NEXT: flat_load_dword v2, v[0:1] +; CI-NEXT: flat_load_dword v3, v[3:4] +; CI-NEXT: s_mov_b64 s[2:3], 0 +; CI-NEXT: .LBB36_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v5, v3 +; CI-NEXT: v_mov_b32_e32 v4, v2 +; CI-NEXT: v_add_i32_e32 v2, vcc, 1, v4 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: v_cmp_le_u64_e32 vcc, 42, v[4:5] +; CI-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; CI-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; CI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; CI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; CI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; CI-NEXT: s_cbranch_execnz .LBB36_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end +; CI-NEXT: s_or_b64 exec, exec, s[2:3] +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_add_u32 s0, s0, 4 +; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_mov_b32_e32 v4, s0 -; CI-NEXT: flat_store_dword v[2:3], v0 -; CI-NEXT: flat_store_dword v[4:5], v1 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_ret_i64_offset_system: @@ -3609,43 +4225,80 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s2, s2, 32 +; VI-NEXT: s_add_u32 s4, s2, 32 +; VI-NEXT: s_addc_u32 s5, s3, 0 +; VI-NEXT: s_add_u32 s2, s2, 36 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v3, s2 +; VI-NEXT: flat_load_dword v2, v[0:1] +; VI-NEXT: flat_load_dword v3, v[3:4] +; VI-NEXT: s_mov_b64 s[2:3], 0 +; VI-NEXT: .LBB36_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, v3 +; VI-NEXT: v_mov_b32_e32 v4, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v4 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: v_cmp_le_u64_e32 vcc, 42, v[4:5] +; VI-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; VI-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; VI-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; VI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; VI-NEXT: s_cbranch_execnz .LBB36_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[2:3] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_add_u32 s0, s0, 4 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: flat_store_dword v[2:3], v0 -; VI-NEXT: flat_store_dword v[4:5], v1 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v5, v3 +; GFX9-NEXT: v_mov_b32_e32 v4, v2 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 1, v4 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, 42, v[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5] +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB36_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_system: @@ -3655,51 +4308,107 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr % ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s2, s2, 32 ; GFX10-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_mov_b32 s2, 0 +; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v5, v3 +; GFX10-NEXT: v_mov_b32_e32 v4, v2 +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v4, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v5, vcc_lo +; GFX10-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[4:5] +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: s_cbranch_execnz .LBB36_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 -; GFX11-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: flat_load_b64 v[2:3], v[0:1] offset:32 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v4, 1 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v5, vcc_lo +; GFX11-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:32 glc ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_cbranch_execnz .LBB36_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: flat_store_b64 v[0:1], v[2:3] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_inc_ret_i64_offset_system: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, 42 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 -; GFX12-NEXT: v_mov_b32_e32 v3, s3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_mov_b32 s2, 0 +; GFX12-NEXT: flat_load_b64 v[2:3], v[0:1] offset:32 +; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v4, 1 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v5, vcc_lo +; GFX12-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[4:5] +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX12-NEXT: s_cbranch_execnz .LBB36_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_store_b64 v[0:1], v[2:3] ; GFX12-NEXT: s_endpgm %gep = getelementptr inbounds i64, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0 @@ -3912,17 +4621,37 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CI-NEXT: s_add_i32 s12, s12, s17 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_add_u32 s0, s0, 32 +; CI-NEXT: s_add_u32 s2, s0, 32 +; CI-NEXT: s_addc_u32 s3, s1, 0 +; CI-NEXT: v_mov_b32_e32 v5, s3 +; CI-NEXT: s_add_u32 s0, s0, 36 +; CI-NEXT: v_mov_b32_e32 v4, s2 ; CI-NEXT: s_addc_u32 s1, s1, 0 -; CI-NEXT: v_mov_b32_e32 v3, s1 -; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_load_dword v2, v[4:5] +; CI-NEXT: flat_load_dword v3, v[0:1] +; CI-NEXT: s_mov_b64 s[0:1], 0 +; CI-NEXT: .LBB39_1: ; %atomicrmw.start +; CI-NEXT: ; =>This Inner Loop Header: Depth=1 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; CI-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; CI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; CI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; CI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; CI-NEXT: v_mov_b32_e32 v3, v1 +; CI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; CI-NEXT: v_mov_b32_e32 v2, v0 +; CI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; CI-NEXT: s_cbranch_execnz .LBB39_1 +; CI-NEXT: ; %bb.2: ; %atomicrmw.end ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_noret_i64_offset_system: @@ -3930,32 +4659,67 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; VI-NEXT: s_add_i32 s12, s12, s17 ; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: s_add_u32 s2, s0, 32 +; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: s_add_u32 s0, s0, 36 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_load_dword v2, v[4:5] +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[0:1], 0 +; VI-NEXT: .LBB39_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; VI-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[0:1] +; VI-NEXT: s_cbranch_execnz .LBB39_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17 -; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v2, s0 -; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, 42, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB39_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_system: @@ -3965,46 +4729,96 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 32 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s0 -; GFX10-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v5, s1 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v2, 1 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo +; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl1_inv ; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 +; GFX10-NEXT: s_cbranch_execnz .LBB39_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; GFX11-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: flat_load_b64 v[2:3], v[4:5] offset:32 +; GFX11-NEXT: .p2align 6 +; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, 1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX11-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo +; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: buffer_gl1_inv ; GFX11-NEXT: buffer_gl0_inv +; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX11-NEXT: s_cbranch_execnz .LBB39_1 +; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: flat_atomic_inc_noret_i64_offset_system: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, 42 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: flat_load_b64 v[2:3], v[4:5] offset:32 +; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v2, 1 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX12-NEXT: v_cmp_le_u64_e32 vcc_lo, 42, v[2:3] +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo ; GFX12-NEXT: global_wb scope:SCOPE_SYS ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; GFX12-NEXT: s_cbranch_execnz .LBB39_1 +; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-NEXT: s_endpgm %gep = getelementptr inbounds i64, ptr %ptr, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i64 42 seq_cst, align 8, !noalias.addrspace !0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 1d3368b036d0..df60fb522554 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -7119,502 +7119,714 @@ entry: define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace(1) %uniform.ptr, i8 %val) { ; GFX7LESS-LABEL: uniform_or_i8: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7LESS-NEXT: s_load_dword s6, s[4:5], 0xd +; GFX7LESS-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dword s12, s[4:5], 0xd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB12_4 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_and_b32 s8, s2, -4 -; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 -; GFX7LESS-NEXT: s_and_b32 s2, s2, 3 -; GFX7LESS-NEXT: s_lshl_b32 s2, s2, 3 -; GFX7LESS-NEXT: s_and_b32 s7, s6, 0xff -; GFX7LESS-NEXT: s_lshl_b32 s7, s7, s2 -; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: s_mov_b32 s9, s3 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s7 -; GFX7LESS-NEXT: buffer_atomic_or v0, off, s[8:11], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s2, v0 -; GFX7LESS-NEXT: .LBB12_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_and_b32 s4, s10, -4 +; GFX7LESS-NEXT: s_mov_b32 s5, s11 +; GFX7LESS-NEXT: s_and_b32 s0, s10, 3 +; GFX7LESS-NEXT: s_and_b32 s1, s12, 0xff +; GFX7LESS-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], 0 +; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 +; GFX7LESS-NEXT: s_lshl_b32 s13, s0, 3 +; GFX7LESS-NEXT: s_lshl_b32 s14, s1, s13 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS-NEXT: s_mov_b32 s6, -1 +; GFX7LESS-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, s14, v1 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_2 +; GFX7LESS-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX7LESS-NEXT: v_lshrrev_b32_e32 v0, s13, v2 +; GFX7LESS-NEXT: .LBB12_4: ; %Flow +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s12 +; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; GFX7LESS-NEXT: v_or_b32_e32 v0, s4, v0 -; GFX7LESS-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX7LESS-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX7LESS-NEXT: buffer_store_byte v0, off, s[8:11], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: uniform_or_i8: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX8-NEXT: s_load_dword s12, s[4:5], 0x34 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB12_2 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_cbranch_execz .LBB12_4 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s8, s2, -4 -; GFX8-NEXT: s_and_b32 s2, s2, 3 -; GFX8-NEXT: s_mov_b32 s9, s3 -; GFX8-NEXT: s_lshl_b32 s2, s2, 3 -; GFX8-NEXT: s_and_b32 s3, s6, 0xff -; GFX8-NEXT: s_lshl_b32 s3, s3, s2 +; GFX8-NEXT: s_and_b32 s4, s10, -4 +; GFX8-NEXT: s_mov_b32 s5, s11 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_and_b32 s0, s10, 3 +; GFX8-NEXT: s_lshl_b32 s13, s0, 3 +; GFX8-NEXT: s_and_b32 s0, s12, 0xff +; GFX8-NEXT: s_lshl_b32 s14, s0, s13 +; GFX8-NEXT: s_mov_b64 s[10:11], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_or_b32_e32 v0, s14, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v1 +; GFX8-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] +; GFX8-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_cbranch_execnz .LBB12_2 +; GFX8-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s13, v2 +; GFX8-NEXT: .LBB12_4: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GFX8-NEXT: s_mov_b32 s11, 0xf000 ; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: buffer_atomic_or v0, off, s[8:11], 0 glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s2, v0 -; GFX8-NEXT: .LBB12_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_or_b32_e32 v0, s4, v0 -; GFX8-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX8-NEXT: buffer_store_byte v0, off, s[8:11], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: uniform_or_i8: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s12, s[4:5], 0x34 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB12_2 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB12_4 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s8, s2, -4 -; GFX9-NEXT: s_and_b32 s2, s2, 3 -; GFX9-NEXT: s_mov_b32 s9, s3 -; GFX9-NEXT: s_lshl_b32 s2, s2, 3 -; GFX9-NEXT: s_and_b32 s3, s6, 0xff -; GFX9-NEXT: s_lshl_b32 s3, s3, s2 +; GFX9-NEXT: s_and_b32 s4, s10, -4 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NEXT: s_and_b32 s0, s10, 3 +; GFX9-NEXT: s_lshl_b32 s13, s0, 3 +; GFX9-NEXT: s_and_b32 s0, s12, 0xff +; GFX9-NEXT: s_lshl_b32 s14, s0, s13 +; GFX9-NEXT: s_mov_b64 s[10:11], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_or_b32_e32 v0, s14, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v1 +; GFX9-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] +; GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX9-NEXT: s_cbranch_execnz .LBB12_2 +; GFX9-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, s13, v2 +; GFX9-NEXT: .LBB12_4: ; %Flow +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: buffer_atomic_or v0, off, s[8:11], 0 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, s2, v0 -; GFX9-NEXT: .LBB12_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_or_b32_e32 v0, s4, v0 -; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX9-NEXT: buffer_store_byte v0, off, s[8:11], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: uniform_or_i8: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1064-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX1064-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dword s12, s[4:5], 0x34 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB12_2 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB12_4 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_and_b32 s7, s2, 3 -; GFX1064-NEXT: s_and_b32 s8, s6, 0xff -; GFX1064-NEXT: s_lshl_b32 s7, s7, 3 -; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1064-NEXT: s_lshl_b32 s9, s8, s7 -; GFX1064-NEXT: s_and_b32 s8, s2, -4 -; GFX1064-NEXT: v_mov_b32_e32 v0, s9 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: s_mov_b32 s9, s3 -; GFX1064-NEXT: buffer_atomic_or v0, off, s[8:11], 0 glc +; GFX1064-NEXT: s_and_b32 s4, s10, -4 +; GFX1064-NEXT: s_mov_b32 s5, s11 +; GFX1064-NEXT: s_and_b32 s1, s10, 3 +; GFX1064-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX1064-NEXT: s_lshl_b32 s13, s1, 3 +; GFX1064-NEXT: s_and_b32 s1, s12, 0xff +; GFX1064-NEXT: s_mov_b64 s[10:11], 0 +; GFX1064-NEXT: s_lshl_b32 s14, s1, s13 +; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s6, -1 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_or_b32_e32 v0, s14, v1 +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s7, v0 -; GFX1064-NEXT: .LBB12_2: -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v2 +; GFX1064-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX1064-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1064-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX1064-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s13, v2 +; GFX1064-NEXT: .LBB12_4: ; %Flow +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc -; GFX1064-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064-NEXT: v_cndmask_b32_e64 v0, s12, 0, vcc +; GFX1064-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1064-NEXT: buffer_store_byte v0, off, s[8:11], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: uniform_or_i8: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1032-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX1032-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dword s1, s[4:5], 0x34 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB12_2 +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB12_4 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_and_b32 s5, s2, 3 -; GFX1032-NEXT: s_and_b32 s7, s6, 0xff -; GFX1032-NEXT: s_lshl_b32 s5, s5, 3 -; GFX1032-NEXT: s_and_b32 s8, s2, -4 -; GFX1032-NEXT: s_lshl_b32 s7, s7, s5 -; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1032-NEXT: v_mov_b32_e32 v0, s7 -; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_mov_b32 s9, s3 -; GFX1032-NEXT: buffer_atomic_or v0, off, s[8:11], 0 glc +; GFX1032-NEXT: s_and_b32 s4, s10, -4 +; GFX1032-NEXT: s_mov_b32 s5, s11 +; GFX1032-NEXT: s_and_b32 s6, s10, 3 +; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX1032-NEXT: s_lshl_b32 s10, s6, 3 +; GFX1032-NEXT: s_and_b32 s6, s1, 0xff +; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: s_lshl_b32 s11, s6, s10 +; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_or_b32_e32 v0, s11, v1 +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s5, v0 -; GFX1032-NEXT: .LBB12_2: -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, v2, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v2 +; GFX1032-NEXT: s_or_b32 s3, s0, s3 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1032-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX1032-NEXT: .LBB12_4: ; %Flow +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc_lo -; GFX1032-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032-NEXT: v_cndmask_b32_e64 v0, s1, 0, vcc_lo +; GFX1032-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1032-NEXT: buffer_store_byte v0, off, s[8:11], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-TRUE16-LABEL: uniform_or_i8: ; GFX1164-TRUE16: ; %bb.0: ; GFX1164-TRUE16-NEXT: s_clause 0x1 -; GFX1164-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1164-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX1164-TRUE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX1164-TRUE16-NEXT: s_load_b32 s12, s[4:5], 0x34 ; GFX1164-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 -; GFX1164-TRUE16-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-TRUE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX1164-TRUE16-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-TRUE16-NEXT: s_cbranch_execz .LBB12_4 ; GFX1164-TRUE16-NEXT: ; %bb.1: ; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-TRUE16-NEXT: s_and_b32 s7, s2, 3 -; GFX1164-TRUE16-NEXT: s_and_b32 s8, s6, 0xff -; GFX1164-TRUE16-NEXT: s_lshl_b32 s7, s7, 3 -; GFX1164-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1164-TRUE16-NEXT: s_lshl_b32 s9, s8, s7 -; GFX1164-TRUE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v0, s9 -; GFX1164-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1164-TRUE16-NEXT: s_mov_b32 s9, s3 -; GFX1164-TRUE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], 0 glc +; GFX1164-TRUE16-NEXT: s_and_b32 s4, s10, -4 +; GFX1164-TRUE16-NEXT: s_mov_b32 s5, s11 +; GFX1164-TRUE16-NEXT: s_and_b32 s1, s10, 3 +; GFX1164-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1164-TRUE16-NEXT: s_lshl_b32 s13, s1, 3 +; GFX1164-TRUE16-NEXT: s_and_b32 s1, s12, 0xff +; GFX1164-TRUE16-NEXT: s_mov_b64 s[10:11], 0 +; GFX1164-TRUE16-NEXT: s_lshl_b32 s14, s1, s13 +; GFX1164-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-TRUE16-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-TRUE16-NEXT: v_or_b32_e32 v0, s14, v1 +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s7, v0 -; GFX1164-TRUE16-NEXT: .LBB12_2: -; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v1 +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1164-TRUE16-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[10:11] +; GFX1164-TRUE16-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1164-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s13, v2 +; GFX1164-TRUE16-NEXT: .LBB12_4: ; %Flow +; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-TRUE16-NEXT: v_cndmask_b16 v0.l, s6, 0, vcc +; GFX1164-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164-TRUE16-NEXT: v_cndmask_b16 v0.l, s12, 0, vcc ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l -; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX1164-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX1164-TRUE16-NEXT: v_or_b16 v0.l, s0, v0.l +; GFX1164-TRUE16-NEXT: buffer_store_b8 v0, off, s[8:11], 0 ; GFX1164-TRUE16-NEXT: s_endpgm ; ; GFX1164-FAKE16-LABEL: uniform_or_i8: ; GFX1164-FAKE16: ; %bb.0: ; GFX1164-FAKE16-NEXT: s_clause 0x1 -; GFX1164-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1164-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX1164-FAKE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX1164-FAKE16-NEXT: s_load_b32 s12, s[4:5], 0x34 ; GFX1164-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1164-FAKE16-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-FAKE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX1164-FAKE16-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-FAKE16-NEXT: s_cbranch_execz .LBB12_4 ; GFX1164-FAKE16-NEXT: ; %bb.1: ; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-FAKE16-NEXT: s_and_b32 s7, s2, 3 -; GFX1164-FAKE16-NEXT: s_and_b32 s8, s6, 0xff -; GFX1164-FAKE16-NEXT: s_lshl_b32 s7, s7, 3 -; GFX1164-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1164-FAKE16-NEXT: s_lshl_b32 s9, s8, s7 -; GFX1164-FAKE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v0, s9 -; GFX1164-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1164-FAKE16-NEXT: s_mov_b32 s9, s3 -; GFX1164-FAKE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], 0 glc +; GFX1164-FAKE16-NEXT: s_and_b32 s4, s10, -4 +; GFX1164-FAKE16-NEXT: s_mov_b32 s5, s11 +; GFX1164-FAKE16-NEXT: s_and_b32 s1, s10, 3 +; GFX1164-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1164-FAKE16-NEXT: s_lshl_b32 s13, s1, 3 +; GFX1164-FAKE16-NEXT: s_and_b32 s1, s12, 0xff +; GFX1164-FAKE16-NEXT: s_mov_b64 s[10:11], 0 +; GFX1164-FAKE16-NEXT: s_lshl_b32 s14, s1, s13 +; GFX1164-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-FAKE16-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-FAKE16-NEXT: v_or_b32_e32 v0, s14, v1 +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s7, v0 -; GFX1164-FAKE16-NEXT: .LBB12_2: -; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v1 +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1164-FAKE16-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[10:11] +; GFX1164-FAKE16-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1164-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s13, v2 +; GFX1164-FAKE16-NEXT: .LBB12_4: ; %Flow +; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-FAKE16-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc +; GFX1164-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX1164-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164-FAKE16-NEXT: v_cndmask_b32_e64 v0, s12, 0, vcc ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-FAKE16-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1164-FAKE16-NEXT: s_mov_b32 s2, -1 -; GFX1164-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX1164-FAKE16-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1164-FAKE16-NEXT: buffer_store_b8 v0, off, s[8:11], 0 ; GFX1164-FAKE16-NEXT: s_endpgm ; ; GFX1132-TRUE16-LABEL: uniform_or_i8: ; GFX1132-TRUE16: ; %bb.0: ; GFX1132-TRUE16-NEXT: s_clause 0x1 -; GFX1132-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX1132-TRUE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX1132-TRUE16-NEXT: s_load_b32 s1, s[4:5], 0x34 ; GFX1132-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 -; GFX1132-TRUE16-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1132-TRUE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX1132-TRUE16-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1132-TRUE16-NEXT: s_cbranch_execz .LBB12_4 ; GFX1132-TRUE16-NEXT: ; %bb.1: ; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-TRUE16-NEXT: s_and_b32 s6, s2, 3 -; GFX1132-TRUE16-NEXT: s_and_b32 s7, s4, 0xff -; GFX1132-TRUE16-NEXT: s_lshl_b32 s6, s6, 3 -; GFX1132-TRUE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1132-TRUE16-NEXT: s_lshl_b32 s7, s7, s6 -; GFX1132-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v0, s7 -; GFX1132-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1132-TRUE16-NEXT: s_mov_b32 s9, s3 -; GFX1132-TRUE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], 0 glc +; GFX1132-TRUE16-NEXT: s_and_b32 s4, s10, -4 +; GFX1132-TRUE16-NEXT: s_mov_b32 s5, s11 +; GFX1132-TRUE16-NEXT: s_and_b32 s6, s10, 3 +; GFX1132-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1132-TRUE16-NEXT: s_lshl_b32 s10, s6, 3 +; GFX1132-TRUE16-NEXT: s_and_b32 s6, s1, 0xff +; GFX1132-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-TRUE16-NEXT: s_lshl_b32 s11, s6, s10 +; GFX1132-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, s0 +; GFX1132-TRUE16-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-TRUE16-NEXT: v_or_b32_e32 v0, s11, v1 +; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s6, v0 -; GFX1132-TRUE16-NEXT: .LBB12_2: -; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, v2, v1 +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1132-TRUE16-NEXT: s_or_b32 s3, s0, s3 +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 +; GFX1132-TRUE16-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1132-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX1132-TRUE16-NEXT: .LBB12_4: ; %Flow +; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, 0, vcc_lo +; GFX1132-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132-TRUE16-NEXT: v_cndmask_b16 v0.l, s1, 0, vcc_lo ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l -; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX1132-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX1132-TRUE16-NEXT: v_or_b16 v0.l, s0, v0.l +; GFX1132-TRUE16-NEXT: buffer_store_b8 v0, off, s[8:11], 0 ; GFX1132-TRUE16-NEXT: s_endpgm ; ; GFX1132-FAKE16-LABEL: uniform_or_i8: ; GFX1132-FAKE16: ; %bb.0: ; GFX1132-FAKE16-NEXT: s_clause 0x1 -; GFX1132-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX1132-FAKE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX1132-FAKE16-NEXT: s_load_b32 s1, s[4:5], 0x34 ; GFX1132-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-FAKE16-NEXT: s_mov_b32 s3, 0 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1132-FAKE16-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1132-FAKE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX1132-FAKE16-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1132-FAKE16-NEXT: s_cbranch_execz .LBB12_4 ; GFX1132-FAKE16-NEXT: ; %bb.1: ; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-FAKE16-NEXT: s_and_b32 s6, s2, 3 -; GFX1132-FAKE16-NEXT: s_and_b32 s7, s4, 0xff -; GFX1132-FAKE16-NEXT: s_lshl_b32 s6, s6, 3 -; GFX1132-FAKE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1132-FAKE16-NEXT: s_lshl_b32 s7, s7, s6 -; GFX1132-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v0, s7 -; GFX1132-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1132-FAKE16-NEXT: s_mov_b32 s9, s3 -; GFX1132-FAKE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], 0 glc +; GFX1132-FAKE16-NEXT: s_and_b32 s4, s10, -4 +; GFX1132-FAKE16-NEXT: s_mov_b32 s5, s11 +; GFX1132-FAKE16-NEXT: s_and_b32 s6, s10, 3 +; GFX1132-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1132-FAKE16-NEXT: s_lshl_b32 s10, s6, 3 +; GFX1132-FAKE16-NEXT: s_and_b32 s6, s1, 0xff +; GFX1132-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-FAKE16-NEXT: s_lshl_b32 s11, s6, s10 +; GFX1132-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, s0 +; GFX1132-FAKE16-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-FAKE16-NEXT: v_or_b32_e32 v0, s11, v1 +; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s6, v0 -; GFX1132-FAKE16-NEXT: .LBB12_2: -; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e64 s0, v2, v1 +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1132-FAKE16-NEXT: s_or_b32 s3, s0, s3 +; GFX1132-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 +; GFX1132-FAKE16-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1132-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX1132-FAKE16-NEXT: .LBB12_4: ; %Flow +; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-FAKE16-NEXT: v_cndmask_b32_e64 v0, s4, 0, vcc_lo +; GFX1132-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX1132-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132-FAKE16-NEXT: v_cndmask_b32_e64 v0, s1, 0, vcc_lo ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1132-FAKE16-NEXT: s_mov_b32 s2, -1 -; GFX1132-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX1132-FAKE16-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1132-FAKE16-NEXT: buffer_store_b8 v0, off, s[8:11], 0 ; GFX1132-FAKE16-NEXT: s_endpgm ; ; GFX1264-TRUE16-LABEL: uniform_or_i8: ; GFX1264-TRUE16: ; %bb.0: ; GFX1264-TRUE16-NEXT: s_clause 0x1 -; GFX1264-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1264-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX1264-TRUE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX1264-TRUE16-NEXT: s_load_b32 s12, s[4:5], 0x34 ; GFX1264-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1264-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 -; GFX1264-TRUE16-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1264-TRUE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX1264-TRUE16-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1264-TRUE16-NEXT: s_cbranch_execz .LBB12_4 ; GFX1264-TRUE16-NEXT: ; %bb.1: ; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-TRUE16-NEXT: s_and_b32 s7, s2, 3 -; GFX1264-TRUE16-NEXT: s_and_b32 s8, s6, 0xff -; GFX1264-TRUE16-NEXT: s_lshl_b32 s7, s7, 3 -; GFX1264-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1264-TRUE16-NEXT: s_lshl_b32 s9, s8, s7 -; GFX1264-TRUE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v0, s9 -; GFX1264-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1264-TRUE16-NEXT: s_mov_b32 s9, s3 -; GFX1264-TRUE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1264-TRUE16-NEXT: s_and_b32 s4, s10, -4 +; GFX1264-TRUE16-NEXT: s_mov_b32 s5, s11 +; GFX1264-TRUE16-NEXT: s_and_b32 s1, s10, 3 +; GFX1264-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1264-TRUE16-NEXT: s_lshl_b32 s13, s1, 3 +; GFX1264-TRUE16-NEXT: s_and_b32 s1, s12, 0xff +; GFX1264-TRUE16-NEXT: s_mov_b64 s[10:11], 0 +; GFX1264-TRUE16-NEXT: s_lshl_b32 s14, s1, s13 +; GFX1264-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1264-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, s0 +; GFX1264-TRUE16-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264-TRUE16-NEXT: v_or_b32_e32 v0, s14, v1 +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s7, v0 -; GFX1264-TRUE16-NEXT: .LBB12_2: -; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v1 +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1264-TRUE16-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[10:11] +; GFX1264-TRUE16-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1264-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s13, v2 +; GFX1264-TRUE16-NEXT: .LBB12_4: ; %Flow +; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1264-TRUE16-NEXT: v_cndmask_b16 v0.l, s6, 0, vcc +; GFX1264-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1264-TRUE16-NEXT: v_cndmask_b16 v0.l, s12, 0, vcc +; GFX1264-TRUE16-NEXT: s_wait_alu 0xf1ff ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l -; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX1264-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], null +; GFX1264-TRUE16-NEXT: v_or_b16 v0.l, s0, v0.l +; GFX1264-TRUE16-NEXT: buffer_store_b8 v0, off, s[8:11], null ; GFX1264-TRUE16-NEXT: s_endpgm ; ; GFX1264-FAKE16-LABEL: uniform_or_i8: ; GFX1264-FAKE16: ; %bb.0: ; GFX1264-FAKE16-NEXT: s_clause 0x1 -; GFX1264-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1264-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX1264-FAKE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX1264-FAKE16-NEXT: s_load_b32 s12, s[4:5], 0x34 ; GFX1264-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1264-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1264-FAKE16-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1264-FAKE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX1264-FAKE16-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1264-FAKE16-NEXT: s_cbranch_execz .LBB12_4 ; GFX1264-FAKE16-NEXT: ; %bb.1: ; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-FAKE16-NEXT: s_and_b32 s7, s2, 3 -; GFX1264-FAKE16-NEXT: s_and_b32 s8, s6, 0xff -; GFX1264-FAKE16-NEXT: s_lshl_b32 s7, s7, 3 -; GFX1264-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1264-FAKE16-NEXT: s_lshl_b32 s9, s8, s7 -; GFX1264-FAKE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v0, s9 -; GFX1264-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1264-FAKE16-NEXT: s_mov_b32 s9, s3 -; GFX1264-FAKE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1264-FAKE16-NEXT: s_and_b32 s4, s10, -4 +; GFX1264-FAKE16-NEXT: s_mov_b32 s5, s11 +; GFX1264-FAKE16-NEXT: s_and_b32 s1, s10, 3 +; GFX1264-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1264-FAKE16-NEXT: s_lshl_b32 s13, s1, 3 +; GFX1264-FAKE16-NEXT: s_and_b32 s1, s12, 0xff +; GFX1264-FAKE16-NEXT: s_mov_b64 s[10:11], 0 +; GFX1264-FAKE16-NEXT: s_lshl_b32 s14, s1, s13 +; GFX1264-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1264-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, s0 +; GFX1264-FAKE16-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264-FAKE16-NEXT: v_or_b32_e32 v0, s14, v1 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s7, v0 -; GFX1264-FAKE16-NEXT: .LBB12_2: -; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v1 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1264-FAKE16-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[10:11] +; GFX1264-FAKE16-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1264-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s13, v2 +; GFX1264-FAKE16-NEXT: .LBB12_4: ; %Flow +; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1264-FAKE16-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc +; GFX1264-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX1264-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1264-FAKE16-NEXT: v_cndmask_b32_e64 v0, s12, 0, vcc +; GFX1264-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-FAKE16-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1264-FAKE16-NEXT: s_mov_b32 s2, -1 -; GFX1264-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], null +; GFX1264-FAKE16-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1264-FAKE16-NEXT: buffer_store_b8 v0, off, s[8:11], null ; GFX1264-FAKE16-NEXT: s_endpgm ; ; GFX1232-TRUE16-LABEL: uniform_or_i8: ; GFX1232-TRUE16: ; %bb.0: ; GFX1232-TRUE16-NEXT: s_clause 0x1 -; GFX1232-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1232-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX1232-TRUE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX1232-TRUE16-NEXT: s_load_b32 s1, s[4:5], 0x34 ; GFX1232-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 -; GFX1232-TRUE16-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1232-TRUE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX1232-TRUE16-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1232-TRUE16-NEXT: s_cbranch_execz .LBB12_4 ; GFX1232-TRUE16-NEXT: ; %bb.1: ; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-TRUE16-NEXT: s_and_b32 s6, s2, 3 -; GFX1232-TRUE16-NEXT: s_and_b32 s7, s4, 0xff -; GFX1232-TRUE16-NEXT: s_lshl_b32 s6, s6, 3 -; GFX1232-TRUE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1232-TRUE16-NEXT: s_lshl_b32 s7, s7, s6 -; GFX1232-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v0, s7 -; GFX1232-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1232-TRUE16-NEXT: s_mov_b32 s9, s3 -; GFX1232-TRUE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1232-TRUE16-NEXT: s_and_b32 s4, s10, -4 +; GFX1232-TRUE16-NEXT: s_mov_b32 s5, s11 +; GFX1232-TRUE16-NEXT: s_and_b32 s6, s10, 3 +; GFX1232-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1232-TRUE16-NEXT: s_lshl_b32 s10, s6, 3 +; GFX1232-TRUE16-NEXT: s_and_b32 s6, s1, 0xff +; GFX1232-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1232-TRUE16-NEXT: s_lshl_b32 s11, s6, s10 +; GFX1232-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, s0 +; GFX1232-TRUE16-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232-TRUE16-NEXT: v_or_b32_e32 v0, s11, v1 +; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s6, v0 -; GFX1232-TRUE16-NEXT: .LBB12_2: -; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, v2, v1 +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1232-TRUE16-NEXT: s_or_b32 s3, s0, s3 +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1232-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 +; GFX1232-TRUE16-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1232-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX1232-TRUE16-NEXT: .LBB12_4: ; %Flow +; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1232-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, 0, vcc_lo +; GFX1232-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1232-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1232-TRUE16-NEXT: v_cndmask_b16 v0.l, s1, 0, vcc_lo +; GFX1232-TRUE16-NEXT: s_wait_alu 0xf1ff ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l -; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX1232-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], null +; GFX1232-TRUE16-NEXT: v_or_b16 v0.l, s0, v0.l +; GFX1232-TRUE16-NEXT: buffer_store_b8 v0, off, s[8:11], null ; GFX1232-TRUE16-NEXT: s_endpgm ; ; GFX1232-FAKE16-LABEL: uniform_or_i8: ; GFX1232-FAKE16: ; %bb.0: ; GFX1232-FAKE16-NEXT: s_clause 0x1 -; GFX1232-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1232-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX1232-FAKE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX1232-FAKE16-NEXT: s_load_b32 s1, s[4:5], 0x34 ; GFX1232-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232-FAKE16-NEXT: s_mov_b32 s3, 0 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1232-FAKE16-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1232-FAKE16-NEXT: s_cbranch_execz .LBB12_2 +; GFX1232-FAKE16-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1232-FAKE16-NEXT: s_cbranch_execz .LBB12_4 ; GFX1232-FAKE16-NEXT: ; %bb.1: ; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-FAKE16-NEXT: s_and_b32 s6, s2, 3 -; GFX1232-FAKE16-NEXT: s_and_b32 s7, s4, 0xff -; GFX1232-FAKE16-NEXT: s_lshl_b32 s6, s6, 3 -; GFX1232-FAKE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1232-FAKE16-NEXT: s_lshl_b32 s7, s7, s6 -; GFX1232-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v0, s7 -; GFX1232-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1232-FAKE16-NEXT: s_mov_b32 s9, s3 -; GFX1232-FAKE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1232-FAKE16-NEXT: s_and_b32 s4, s10, -4 +; GFX1232-FAKE16-NEXT: s_mov_b32 s5, s11 +; GFX1232-FAKE16-NEXT: s_and_b32 s6, s10, 3 +; GFX1232-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1232-FAKE16-NEXT: s_lshl_b32 s10, s6, 3 +; GFX1232-FAKE16-NEXT: s_and_b32 s6, s1, 0xff +; GFX1232-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1232-FAKE16-NEXT: s_lshl_b32 s11, s6, s10 +; GFX1232-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, s0 +; GFX1232-FAKE16-NEXT: .LBB12_2: ; %atomicrmw.start +; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232-FAKE16-NEXT: v_or_b32_e32 v0, s11, v1 +; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s6, v0 -; GFX1232-FAKE16-NEXT: .LBB12_2: -; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e64 s0, v2, v1 +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1232-FAKE16-NEXT: s_or_b32 s3, s0, s3 +; GFX1232-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1232-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 +; GFX1232-FAKE16-NEXT: s_cbranch_execnz .LBB12_2 +; GFX1232-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX1232-FAKE16-NEXT: .LBB12_4: ; %Flow +; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1232-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1232-FAKE16-NEXT: v_cndmask_b32_e64 v0, s4, 0, vcc_lo +; GFX1232-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1232-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX1232-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1232-FAKE16-NEXT: v_cndmask_b32_e64 v0, s1, 0, vcc_lo +; GFX1232-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1 -; GFX1232-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], null +; GFX1232-FAKE16-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1232-FAKE16-NEXT: buffer_store_b8 v0, off, s[8:11], null ; GFX1232-FAKE16-NEXT: s_endpgm %rmw = atomicrmw or ptr addrspace(1) %uniform.ptr, i8 %val monotonic, align 1 store i8 %rmw, ptr addrspace(1) %result @@ -8812,498 +9024,710 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspace(1) %uniform.ptr, i16 %val) { ; GFX7LESS-LABEL: uniform_or_i16: ; GFX7LESS: ; %bb.0: -; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GFX7LESS-NEXT: s_load_dword s6, s[4:5], 0xd +; GFX7LESS-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GFX7LESS-NEXT: s_load_dword s12, s[4:5], 0xd ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz .LBB15_2 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB15_4 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_and_b32 s8, s2, -4 -; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 -; GFX7LESS-NEXT: s_and_b32 s2, s2, 3 -; GFX7LESS-NEXT: s_lshl_b32 s2, s2, 3 -; GFX7LESS-NEXT: s_and_b32 s7, s6, 0xffff -; GFX7LESS-NEXT: s_lshl_b32 s7, s7, s2 -; GFX7LESS-NEXT: s_mov_b32 s10, -1 -; GFX7LESS-NEXT: s_mov_b32 s9, s3 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s7 -; GFX7LESS-NEXT: buffer_atomic_or v0, off, s[8:11], 0 glc -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GFX7LESS-NEXT: v_bfe_u32 v0, v0, s2, 16 -; GFX7LESS-NEXT: .LBB15_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_and_b32 s4, s10, -4 +; GFX7LESS-NEXT: s_mov_b32 s5, s11 +; GFX7LESS-NEXT: s_and_b32 s0, s10, 3 +; GFX7LESS-NEXT: s_and_b32 s1, s12, 0xffff +; GFX7LESS-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[10:11], 0 +; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 +; GFX7LESS-NEXT: s_lshl_b32 s13, s0, 3 +; GFX7LESS-NEXT: s_lshl_b32 s14, s1, s13 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 +; GFX7LESS-NEXT: s_mov_b32 s6, -1 +; GFX7LESS-NEXT: .LBB15_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_or_b32_e32 v0, s14, v1 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_2 +; GFX7LESS-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX7LESS-NEXT: v_bfe_u32 v0, v2, s13, 16 +; GFX7LESS-NEXT: .LBB15_4: ; %Flow +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s12 +; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 ; GFX7LESS-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; GFX7LESS-NEXT: v_or_b32_e32 v0, s4, v0 -; GFX7LESS-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX7LESS-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX7LESS-NEXT: buffer_store_short v0, off, s[8:11], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: uniform_or_i16: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX8-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX8-NEXT: s_load_dword s12, s[4:5], 0x34 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz .LBB15_2 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_cbranch_execz .LBB15_4 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s8, s2, -4 -; GFX8-NEXT: s_and_b32 s2, s2, 3 -; GFX8-NEXT: s_mov_b32 s9, s3 -; GFX8-NEXT: s_lshl_b32 s2, s2, 3 -; GFX8-NEXT: s_and_b32 s3, 0xffff, s6 -; GFX8-NEXT: s_lshl_b32 s3, s3, s2 +; GFX8-NEXT: s_and_b32 s4, s10, -4 +; GFX8-NEXT: s_mov_b32 s5, s11 +; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX8-NEXT: s_and_b32 s0, s10, 3 +; GFX8-NEXT: s_lshl_b32 s13, s0, 3 +; GFX8-NEXT: s_and_b32 s0, 0xffff, s12 +; GFX8-NEXT: s_lshl_b32 s14, s0, s13 +; GFX8-NEXT: s_mov_b64 s[10:11], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: .LBB15_2: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_or_b32_e32 v0, s14, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v1 +; GFX8-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] +; GFX8-NEXT: v_mov_b32_e32 v1, v2 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX8-NEXT: s_cbranch_execnz .LBB15_2 +; GFX8-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX8-NEXT: v_lshrrev_b32_e32 v0, s13, v2 +; GFX8-NEXT: .LBB15_4: ; %Flow +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s12 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GFX8-NEXT: s_mov_b32 s11, 0xf000 ; GFX8-NEXT: s_mov_b32 s10, -1 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: buffer_atomic_or v0, off, s[8:11], 0 glc -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v0, s2, v0 -; GFX8-NEXT: .LBB15_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_or_b32_e32 v0, s4, v0 -; GFX8-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX8-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX8-NEXT: buffer_store_short v0, off, s[8:11], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: uniform_or_i16: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-NEXT: s_load_dword s12, s[4:5], 0x34 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB15_2 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB15_4 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s8, s2, -4 -; GFX9-NEXT: s_and_b32 s2, s2, 3 -; GFX9-NEXT: s_mov_b32 s9, s3 -; GFX9-NEXT: s_lshl_b32 s2, s2, 3 -; GFX9-NEXT: s_and_b32 s3, 0xffff, s6 -; GFX9-NEXT: s_lshl_b32 s3, s3, s2 +; GFX9-NEXT: s_and_b32 s4, s10, -4 +; GFX9-NEXT: s_mov_b32 s5, s11 +; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX9-NEXT: s_and_b32 s0, s10, 3 +; GFX9-NEXT: s_lshl_b32 s13, s0, 3 +; GFX9-NEXT: s_and_b32 s0, 0xffff, s12 +; GFX9-NEXT: s_lshl_b32 s14, s0, s13 +; GFX9-NEXT: s_mov_b64 s[10:11], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: .LBB15_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_or_b32_e32 v0, s14, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v1 +; GFX9-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] +; GFX9-NEXT: v_mov_b32_e32 v1, v2 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX9-NEXT: s_cbranch_execnz .LBB15_2 +; GFX9-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX9-NEXT: v_lshrrev_b32_e32 v0, s13, v2 +; GFX9-NEXT: .LBB15_4: ; %Flow +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: buffer_atomic_or v0, off, s[8:11], 0 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, s2, v0 -; GFX9-NEXT: .LBB15_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_or_b32_e32 v0, s4, v0 -; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX9-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: uniform_or_i16: ; GFX1064: ; %bb.0: ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1064-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX1064-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dword s12, s[4:5], 0x34 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz .LBB15_2 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB15_4 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_and_b32 s7, s2, 3 -; GFX1064-NEXT: s_and_b32 s8, 0xffff, s6 -; GFX1064-NEXT: s_lshl_b32 s7, s7, 3 -; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1064-NEXT: s_lshl_b32 s9, s8, s7 -; GFX1064-NEXT: s_and_b32 s8, s2, -4 -; GFX1064-NEXT: v_mov_b32_e32 v0, s9 -; GFX1064-NEXT: s_mov_b32 s10, -1 -; GFX1064-NEXT: s_mov_b32 s9, s3 -; GFX1064-NEXT: buffer_atomic_or v0, off, s[8:11], 0 glc +; GFX1064-NEXT: s_and_b32 s4, s10, -4 +; GFX1064-NEXT: s_mov_b32 s5, s11 +; GFX1064-NEXT: s_and_b32 s1, s10, 3 +; GFX1064-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX1064-NEXT: s_lshl_b32 s13, s1, 3 +; GFX1064-NEXT: s_and_b32 s1, 0xffff, s12 +; GFX1064-NEXT: s_mov_b64 s[10:11], 0 +; GFX1064-NEXT: s_lshl_b32 s14, s1, s13 +; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s6, -1 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s0 +; GFX1064-NEXT: .LBB15_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_or_b32_e32 v0, s14, v1 +; GFX1064-NEXT: v_mov_b32_e32 v3, v1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s7, v0 -; GFX1064-NEXT: .LBB15_2: -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v2 +; GFX1064-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX1064-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1064-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX1064-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX1064-NEXT: v_lshrrev_b32_e32 v0, s13, v2 +; GFX1064-NEXT: .LBB15_4: ; %Flow +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc -; GFX1064-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1064-NEXT: v_cndmask_b32_e64 v0, s12, 0, vcc +; GFX1064-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1064-NEXT: buffer_store_short v0, off, s[8:11], 0 ; GFX1064-NEXT: s_endpgm ; ; GFX1032-LABEL: uniform_or_i16: ; GFX1032: ; %bb.0: ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX1032-NEXT: s_load_dword s6, s[4:5], 0x34 +; GFX1032-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dword s1, s[4:5], 0x34 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz .LBB15_2 +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB15_4 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_and_b32 s5, s2, 3 -; GFX1032-NEXT: s_and_b32 s7, 0xffff, s6 -; GFX1032-NEXT: s_lshl_b32 s5, s5, 3 -; GFX1032-NEXT: s_and_b32 s8, s2, -4 -; GFX1032-NEXT: s_lshl_b32 s7, s7, s5 -; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1032-NEXT: v_mov_b32_e32 v0, s7 -; GFX1032-NEXT: s_mov_b32 s10, -1 -; GFX1032-NEXT: s_mov_b32 s9, s3 -; GFX1032-NEXT: buffer_atomic_or v0, off, s[8:11], 0 glc +; GFX1032-NEXT: s_and_b32 s4, s10, -4 +; GFX1032-NEXT: s_mov_b32 s5, s11 +; GFX1032-NEXT: s_and_b32 s6, s10, 3 +; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX1032-NEXT: s_lshl_b32 s10, s6, 3 +; GFX1032-NEXT: s_and_b32 s6, 0xffff, s1 +; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: s_lshl_b32 s11, s6, s10 +; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s0 +; GFX1032-NEXT: .LBB15_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_or_b32_e32 v0, s11, v1 +; GFX1032-NEXT: v_mov_b32_e32 v3, v1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; GFX1032-NEXT: s_waitcnt vmcnt(0) -; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s5, v0 -; GFX1032-NEXT: .LBB15_2: -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: v_cmp_eq_u32_e64 s0, v2, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v2 +; GFX1032-NEXT: s_or_b32 s3, s0, s3 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1032-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX1032-NEXT: .LBB15_4: ; %Flow +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc_lo -; GFX1032-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: buffer_store_short v0, off, s[0:3], 0 +; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1032-NEXT: v_cndmask_b32_e64 v0, s1, 0, vcc_lo +; GFX1032-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1032-NEXT: buffer_store_short v0, off, s[8:11], 0 ; GFX1032-NEXT: s_endpgm ; ; GFX1164-TRUE16-LABEL: uniform_or_i16: ; GFX1164-TRUE16: ; %bb.0: ; GFX1164-TRUE16-NEXT: s_clause 0x1 -; GFX1164-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1164-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX1164-TRUE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX1164-TRUE16-NEXT: s_load_b32 s12, s[4:5], 0x34 ; GFX1164-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 -; GFX1164-TRUE16-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-TRUE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX1164-TRUE16-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-TRUE16-NEXT: s_cbranch_execz .LBB15_4 ; GFX1164-TRUE16-NEXT: ; %bb.1: ; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-TRUE16-NEXT: s_and_b32 s7, s2, 3 -; GFX1164-TRUE16-NEXT: s_and_b32 s8, 0xffff, s6 -; GFX1164-TRUE16-NEXT: s_lshl_b32 s7, s7, 3 -; GFX1164-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1164-TRUE16-NEXT: s_lshl_b32 s9, s8, s7 -; GFX1164-TRUE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v0, s9 -; GFX1164-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1164-TRUE16-NEXT: s_mov_b32 s9, s3 -; GFX1164-TRUE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], 0 glc +; GFX1164-TRUE16-NEXT: s_and_b32 s4, s10, -4 +; GFX1164-TRUE16-NEXT: s_mov_b32 s5, s11 +; GFX1164-TRUE16-NEXT: s_and_b32 s1, s10, 3 +; GFX1164-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1164-TRUE16-NEXT: s_lshl_b32 s13, s1, 3 +; GFX1164-TRUE16-NEXT: s_and_b32 s1, 0xffff, s12 +; GFX1164-TRUE16-NEXT: s_mov_b64 s[10:11], 0 +; GFX1164-TRUE16-NEXT: s_lshl_b32 s14, s1, s13 +; GFX1164-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-TRUE16-NEXT: .LBB15_2: ; %atomicrmw.start +; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-TRUE16-NEXT: v_or_b32_e32 v0, s14, v1 +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s7, v0 -; GFX1164-TRUE16-NEXT: .LBB15_2: -; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1164-TRUE16-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v1 +; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1164-TRUE16-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[10:11] +; GFX1164-TRUE16-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1164-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s13, v2 +; GFX1164-TRUE16-NEXT: .LBB15_4: ; %Flow +; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-TRUE16-NEXT: s_mov_b32 s10, -1 ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-TRUE16-NEXT: v_cndmask_b16 v0.l, s6, 0, vcc -; GFX1164-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l -; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX1164-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164-TRUE16-NEXT: v_cndmask_b16 v0.l, s12, 0, vcc +; GFX1164-TRUE16-NEXT: v_or_b16 v0.l, s0, v0.l +; GFX1164-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX1164-TRUE16-NEXT: s_endpgm ; ; GFX1164-FAKE16-LABEL: uniform_or_i16: ; GFX1164-FAKE16: ; %bb.0: ; GFX1164-FAKE16-NEXT: s_clause 0x1 -; GFX1164-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1164-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX1164-FAKE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX1164-FAKE16-NEXT: s_load_b32 s12, s[4:5], 0x34 ; GFX1164-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1164-FAKE16-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1164-FAKE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX1164-FAKE16-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-FAKE16-NEXT: s_cbranch_execz .LBB15_4 ; GFX1164-FAKE16-NEXT: ; %bb.1: ; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-FAKE16-NEXT: s_and_b32 s7, s2, 3 -; GFX1164-FAKE16-NEXT: s_and_b32 s8, 0xffff, s6 -; GFX1164-FAKE16-NEXT: s_lshl_b32 s7, s7, 3 -; GFX1164-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1164-FAKE16-NEXT: s_lshl_b32 s9, s8, s7 -; GFX1164-FAKE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v0, s9 -; GFX1164-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1164-FAKE16-NEXT: s_mov_b32 s9, s3 -; GFX1164-FAKE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], 0 glc +; GFX1164-FAKE16-NEXT: s_and_b32 s4, s10, -4 +; GFX1164-FAKE16-NEXT: s_mov_b32 s5, s11 +; GFX1164-FAKE16-NEXT: s_and_b32 s1, s10, 3 +; GFX1164-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1164-FAKE16-NEXT: s_lshl_b32 s13, s1, 3 +; GFX1164-FAKE16-NEXT: s_and_b32 s1, 0xffff, s12 +; GFX1164-FAKE16-NEXT: s_mov_b64 s[10:11], 0 +; GFX1164-FAKE16-NEXT: s_lshl_b32 s14, s1, s13 +; GFX1164-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1164-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, s0 +; GFX1164-FAKE16-NEXT: .LBB15_2: ; %atomicrmw.start +; GFX1164-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-FAKE16-NEXT: v_or_b32_e32 v0, s14, v1 +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s7, v0 -; GFX1164-FAKE16-NEXT: .LBB15_2: -; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1164-FAKE16-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v1 +; GFX1164-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1164-FAKE16-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[10:11] +; GFX1164-FAKE16-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1164-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX1164-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s13, v2 +; GFX1164-FAKE16-NEXT: .LBB15_4: ; %Flow +; GFX1164-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1164-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1164-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1164-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-FAKE16-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc +; GFX1164-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1164-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX1164-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1164-FAKE16-NEXT: v_cndmask_b32_e64 v0, s12, 0, vcc ; GFX1164-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-FAKE16-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1164-FAKE16-NEXT: s_mov_b32 s2, -1 -; GFX1164-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX1164-FAKE16-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1164-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX1164-FAKE16-NEXT: s_endpgm ; ; GFX1132-TRUE16-LABEL: uniform_or_i16: ; GFX1132-TRUE16: ; %bb.0: ; GFX1132-TRUE16-NEXT: s_clause 0x1 -; GFX1132-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX1132-TRUE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX1132-TRUE16-NEXT: s_load_b32 s1, s[4:5], 0x34 ; GFX1132-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 -; GFX1132-TRUE16-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1132-TRUE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX1132-TRUE16-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1132-TRUE16-NEXT: s_cbranch_execz .LBB15_4 ; GFX1132-TRUE16-NEXT: ; %bb.1: ; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-TRUE16-NEXT: s_and_b32 s6, s2, 3 -; GFX1132-TRUE16-NEXT: s_and_b32 s7, 0xffff, s4 -; GFX1132-TRUE16-NEXT: s_lshl_b32 s6, s6, 3 -; GFX1132-TRUE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1132-TRUE16-NEXT: s_lshl_b32 s7, s7, s6 -; GFX1132-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v0, s7 -; GFX1132-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1132-TRUE16-NEXT: s_mov_b32 s9, s3 -; GFX1132-TRUE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], 0 glc +; GFX1132-TRUE16-NEXT: s_and_b32 s4, s10, -4 +; GFX1132-TRUE16-NEXT: s_mov_b32 s5, s11 +; GFX1132-TRUE16-NEXT: s_and_b32 s6, s10, 3 +; GFX1132-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1132-TRUE16-NEXT: s_lshl_b32 s10, s6, 3 +; GFX1132-TRUE16-NEXT: s_and_b32 s6, 0xffff, s1 +; GFX1132-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-TRUE16-NEXT: s_lshl_b32 s11, s6, s10 +; GFX1132-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, s0 +; GFX1132-TRUE16-NEXT: .LBB15_2: ; %atomicrmw.start +; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-TRUE16-NEXT: v_or_b32_e32 v0, s11, v1 +; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s6, v0 -; GFX1132-TRUE16-NEXT: .LBB15_2: -; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1132-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, v2, v1 +; GFX1132-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1132-TRUE16-NEXT: s_or_b32 s3, s0, s3 +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 +; GFX1132-TRUE16-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1132-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX1132-TRUE16-NEXT: .LBB15_4: ; %Flow +; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-TRUE16-NEXT: s_mov_b32 s10, -1 ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, 0, vcc_lo -; GFX1132-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l -; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX1132-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132-TRUE16-NEXT: v_cndmask_b16 v0.l, s1, 0, vcc_lo +; GFX1132-TRUE16-NEXT: v_or_b16 v0.l, s0, v0.l +; GFX1132-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX1132-TRUE16-NEXT: s_endpgm ; ; GFX1132-FAKE16-LABEL: uniform_or_i16: ; GFX1132-FAKE16: ; %bb.0: ; GFX1132-FAKE16-NEXT: s_clause 0x1 -; GFX1132-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1132-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX1132-FAKE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX1132-FAKE16-NEXT: s_load_b32 s1, s[4:5], 0x34 ; GFX1132-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-FAKE16-NEXT: s_mov_b32 s3, 0 ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1132-FAKE16-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1132-FAKE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX1132-FAKE16-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1132-FAKE16-NEXT: s_cbranch_execz .LBB15_4 ; GFX1132-FAKE16-NEXT: ; %bb.1: ; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-FAKE16-NEXT: s_and_b32 s6, s2, 3 -; GFX1132-FAKE16-NEXT: s_and_b32 s7, 0xffff, s4 -; GFX1132-FAKE16-NEXT: s_lshl_b32 s6, s6, 3 -; GFX1132-FAKE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1132-FAKE16-NEXT: s_lshl_b32 s7, s7, s6 -; GFX1132-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v0, s7 -; GFX1132-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1132-FAKE16-NEXT: s_mov_b32 s9, s3 -; GFX1132-FAKE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], 0 glc +; GFX1132-FAKE16-NEXT: s_and_b32 s4, s10, -4 +; GFX1132-FAKE16-NEXT: s_mov_b32 s5, s11 +; GFX1132-FAKE16-NEXT: s_and_b32 s6, s10, 3 +; GFX1132-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1132-FAKE16-NEXT: s_lshl_b32 s10, s6, 3 +; GFX1132-FAKE16-NEXT: s_and_b32 s6, 0xffff, s1 +; GFX1132-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1132-FAKE16-NEXT: s_lshl_b32 s11, s6, s10 +; GFX1132-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, s0 +; GFX1132-FAKE16-NEXT: .LBB15_2: ; %atomicrmw.start +; GFX1132-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-FAKE16-NEXT: v_or_b32_e32 v0, s11, v1 +; GFX1132-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1132-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1132-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s6, v0 -; GFX1132-FAKE16-NEXT: .LBB15_2: -; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1132-FAKE16-NEXT: v_cmp_eq_u32_e64 s0, v2, v1 +; GFX1132-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1132-FAKE16-NEXT: s_or_b32 s3, s0, s3 +; GFX1132-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 +; GFX1132-FAKE16-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1132-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX1132-FAKE16-NEXT: .LBB15_4: ; %Flow +; GFX1132-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1132-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1132-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX1132-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-FAKE16-NEXT: v_cndmask_b32_e64 v0, s4, 0, vcc_lo +; GFX1132-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1132-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX1132-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1132-FAKE16-NEXT: v_cndmask_b32_e64 v0, s1, 0, vcc_lo ; GFX1132-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-FAKE16-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1132-FAKE16-NEXT: s_mov_b32 s2, -1 -; GFX1132-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX1132-FAKE16-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1132-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX1132-FAKE16-NEXT: s_endpgm ; ; GFX1264-TRUE16-LABEL: uniform_or_i16: ; GFX1264-TRUE16: ; %bb.0: ; GFX1264-TRUE16-NEXT: s_clause 0x1 -; GFX1264-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1264-TRUE16-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX1264-TRUE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX1264-TRUE16-NEXT: s_load_b32 s12, s[4:5], 0x34 ; GFX1264-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1264-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 -; GFX1264-TRUE16-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1264-TRUE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX1264-TRUE16-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1264-TRUE16-NEXT: s_cbranch_execz .LBB15_4 ; GFX1264-TRUE16-NEXT: ; %bb.1: ; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-TRUE16-NEXT: s_and_b32 s7, s2, 3 -; GFX1264-TRUE16-NEXT: s_and_b32 s8, 0xffff, s6 -; GFX1264-TRUE16-NEXT: s_lshl_b32 s7, s7, 3 -; GFX1264-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1264-TRUE16-NEXT: s_lshl_b32 s9, s8, s7 -; GFX1264-TRUE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v0, s9 -; GFX1264-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1264-TRUE16-NEXT: s_mov_b32 s9, s3 -; GFX1264-TRUE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1264-TRUE16-NEXT: s_and_b32 s4, s10, -4 +; GFX1264-TRUE16-NEXT: s_mov_b32 s5, s11 +; GFX1264-TRUE16-NEXT: s_and_b32 s1, s10, 3 +; GFX1264-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1264-TRUE16-NEXT: s_lshl_b32 s13, s1, 3 +; GFX1264-TRUE16-NEXT: s_and_b32 s1, 0xffff, s12 +; GFX1264-TRUE16-NEXT: s_mov_b64 s[10:11], 0 +; GFX1264-TRUE16-NEXT: s_lshl_b32 s14, s1, s13 +; GFX1264-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1264-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, s0 +; GFX1264-TRUE16-NEXT: .LBB15_2: ; %atomicrmw.start +; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264-TRUE16-NEXT: v_or_b32_e32 v0, s14, v1 +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s7, v0 -; GFX1264-TRUE16-NEXT: .LBB15_2: -; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1264-TRUE16-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v1 +; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1264-TRUE16-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264-TRUE16-NEXT: s_and_not1_b64 exec, exec, s[10:11] +; GFX1264-TRUE16-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1264-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s13, v2 +; GFX1264-TRUE16-NEXT: .LBB15_4: ; %Flow +; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1264-TRUE16-NEXT: v_cndmask_b16 v0.l, s6, 0, vcc -; GFX1264-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l -; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX1264-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null +; GFX1264-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1264-TRUE16-NEXT: v_cndmask_b16 v0.l, s12, 0, vcc +; GFX1264-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX1264-TRUE16-NEXT: v_or_b16 v0.l, s0, v0.l +; GFX1264-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null ; GFX1264-TRUE16-NEXT: s_endpgm ; ; GFX1264-FAKE16-LABEL: uniform_or_i16: ; GFX1264-FAKE16: ; %bb.0: ; GFX1264-FAKE16-NEXT: s_clause 0x1 -; GFX1264-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1264-FAKE16-NEXT: s_load_b32 s6, s[4:5], 0x34 +; GFX1264-FAKE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX1264-FAKE16-NEXT: s_load_b32 s12, s[4:5], 0x34 ; GFX1264-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 ; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1264-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1264-FAKE16-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1264-FAKE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX1264-FAKE16-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1264-FAKE16-NEXT: s_cbranch_execz .LBB15_4 ; GFX1264-FAKE16-NEXT: ; %bb.1: ; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-FAKE16-NEXT: s_and_b32 s7, s2, 3 -; GFX1264-FAKE16-NEXT: s_and_b32 s8, 0xffff, s6 -; GFX1264-FAKE16-NEXT: s_lshl_b32 s7, s7, 3 -; GFX1264-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1264-FAKE16-NEXT: s_lshl_b32 s9, s8, s7 -; GFX1264-FAKE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v0, s9 -; GFX1264-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1264-FAKE16-NEXT: s_mov_b32 s9, s3 -; GFX1264-FAKE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1264-FAKE16-NEXT: s_and_b32 s4, s10, -4 +; GFX1264-FAKE16-NEXT: s_mov_b32 s5, s11 +; GFX1264-FAKE16-NEXT: s_and_b32 s1, s10, 3 +; GFX1264-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1264-FAKE16-NEXT: s_lshl_b32 s13, s1, 3 +; GFX1264-FAKE16-NEXT: s_and_b32 s1, 0xffff, s12 +; GFX1264-FAKE16-NEXT: s_mov_b64 s[10:11], 0 +; GFX1264-FAKE16-NEXT: s_lshl_b32 s14, s1, s13 +; GFX1264-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1264-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, s0 +; GFX1264-FAKE16-NEXT: .LBB15_2: ; %atomicrmw.start +; GFX1264-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1264-FAKE16-NEXT: v_or_b32_e32 v0, s14, v1 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v2, v0 +; GFX1264-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s7, v0 -; GFX1264-FAKE16-NEXT: .LBB15_2: -; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1264-FAKE16-NEXT: v_cmp_eq_u32_e64 s[0:1], v2, v1 +; GFX1264-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1264-FAKE16-NEXT: s_or_b64 s[10:11], s[0:1], s[10:11] +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1264-FAKE16-NEXT: s_and_not1_b64 exec, exec, s[10:11] +; GFX1264-FAKE16-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1264-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX1264-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s13, v2 +; GFX1264-FAKE16-NEXT: .LBB15_4: ; %Flow +; GFX1264-FAKE16-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1264-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1264-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1264-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1264-FAKE16-NEXT: v_cndmask_b32_e64 v0, s6, 0, vcc +; GFX1264-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1264-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX1264-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1264-FAKE16-NEXT: v_cndmask_b32_e64 v0, s12, 0, vcc +; GFX1264-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX1264-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-FAKE16-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1264-FAKE16-NEXT: s_mov_b32 s2, -1 -; GFX1264-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null +; GFX1264-FAKE16-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1264-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null ; GFX1264-FAKE16-NEXT: s_endpgm ; ; GFX1232-TRUE16-LABEL: uniform_or_i16: ; GFX1232-TRUE16: ; %bb.0: ; GFX1232-TRUE16-NEXT: s_clause 0x1 -; GFX1232-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1232-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX1232-TRUE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX1232-TRUE16-NEXT: s_load_b32 s1, s[4:5], 0x34 ; GFX1232-TRUE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0 ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232-TRUE16-NEXT: ; implicit-def: $vgpr0_lo16 -; GFX1232-TRUE16-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1232-TRUE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX1232-TRUE16-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1232-TRUE16-NEXT: s_cbranch_execz .LBB15_4 ; GFX1232-TRUE16-NEXT: ; %bb.1: ; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-TRUE16-NEXT: s_and_b32 s6, s2, 3 -; GFX1232-TRUE16-NEXT: s_and_b32 s7, 0xffff, s4 -; GFX1232-TRUE16-NEXT: s_lshl_b32 s6, s6, 3 -; GFX1232-TRUE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1232-TRUE16-NEXT: s_lshl_b32 s7, s7, s6 -; GFX1232-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v0, s7 -; GFX1232-TRUE16-NEXT: s_mov_b32 s10, -1 -; GFX1232-TRUE16-NEXT: s_mov_b32 s9, s3 -; GFX1232-TRUE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1232-TRUE16-NEXT: s_and_b32 s4, s10, -4 +; GFX1232-TRUE16-NEXT: s_mov_b32 s5, s11 +; GFX1232-TRUE16-NEXT: s_and_b32 s6, s10, 3 +; GFX1232-TRUE16-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1232-TRUE16-NEXT: s_lshl_b32 s10, s6, 3 +; GFX1232-TRUE16-NEXT: s_and_b32 s6, 0xffff, s1 +; GFX1232-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1232-TRUE16-NEXT: s_lshl_b32 s11, s6, s10 +; GFX1232-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, s0 +; GFX1232-TRUE16-NEXT: .LBB15_2: ; %atomicrmw.start +; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232-TRUE16-NEXT: v_or_b32_e32 v0, s11, v1 +; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s6, v0 -; GFX1232-TRUE16-NEXT: .LBB15_2: -; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 +; GFX1232-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, v2, v1 +; GFX1232-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1232-TRUE16-NEXT: s_or_b32 s3, s0, s3 +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1232-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 +; GFX1232-TRUE16-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1232-TRUE16-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX1232-TRUE16-NEXT: .LBB15_4: ; %Flow +; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1232-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, 0, vcc_lo -; GFX1232-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l -; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX1232-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null +; GFX1232-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1232-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1232-TRUE16-NEXT: v_cndmask_b16 v0.l, s1, 0, vcc_lo +; GFX1232-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX1232-TRUE16-NEXT: v_or_b16 v0.l, s0, v0.l +; GFX1232-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], null ; GFX1232-TRUE16-NEXT: s_endpgm ; ; GFX1232-FAKE16-LABEL: uniform_or_i16: ; GFX1232-FAKE16: ; %bb.0: ; GFX1232-FAKE16-NEXT: s_clause 0x1 -; GFX1232-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX1232-FAKE16-NEXT: s_load_b32 s4, s[4:5], 0x34 +; GFX1232-FAKE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 +; GFX1232-FAKE16-NEXT: s_load_b32 s1, s[4:5], 0x34 ; GFX1232-FAKE16-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1232-FAKE16-NEXT: s_mov_b32 s3, 0 ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1232-FAKE16-NEXT: ; implicit-def: $vgpr0 -; GFX1232-FAKE16-NEXT: s_and_saveexec_b32 s5, vcc_lo -; GFX1232-FAKE16-NEXT: s_cbranch_execz .LBB15_2 +; GFX1232-FAKE16-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1232-FAKE16-NEXT: s_cbranch_execz .LBB15_4 ; GFX1232-FAKE16-NEXT: ; %bb.1: ; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-FAKE16-NEXT: s_and_b32 s6, s2, 3 -; GFX1232-FAKE16-NEXT: s_and_b32 s7, 0xffff, s4 -; GFX1232-FAKE16-NEXT: s_lshl_b32 s6, s6, 3 -; GFX1232-FAKE16-NEXT: s_and_b32 s8, s2, -4 -; GFX1232-FAKE16-NEXT: s_lshl_b32 s7, s7, s6 -; GFX1232-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v0, s7 -; GFX1232-FAKE16-NEXT: s_mov_b32 s10, -1 -; GFX1232-FAKE16-NEXT: s_mov_b32 s9, s3 -; GFX1232-FAKE16-NEXT: buffer_atomic_or_b32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1232-FAKE16-NEXT: s_and_b32 s4, s10, -4 +; GFX1232-FAKE16-NEXT: s_mov_b32 s5, s11 +; GFX1232-FAKE16-NEXT: s_and_b32 s6, s10, 3 +; GFX1232-FAKE16-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1232-FAKE16-NEXT: s_lshl_b32 s10, s6, 3 +; GFX1232-FAKE16-NEXT: s_and_b32 s6, 0xffff, s1 +; GFX1232-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1232-FAKE16-NEXT: s_lshl_b32 s11, s6, s10 +; GFX1232-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, s0 +; GFX1232-FAKE16-NEXT: .LBB15_2: ; %atomicrmw.start +; GFX1232-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232-FAKE16-NEXT: v_or_b32_e32 v0, s11, v1 +; GFX1232-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 +; GFX1232-FAKE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-FAKE16-NEXT: s_wait_loadcnt 0x0 -; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s6, v0 -; GFX1232-FAKE16-NEXT: .LBB15_2: -; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1232-FAKE16-NEXT: v_cmp_eq_u32_e64 s0, v2, v1 +; GFX1232-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX1232-FAKE16-NEXT: s_or_b32 s3, s0, s3 +; GFX1232-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1232-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3 +; GFX1232-FAKE16-NEXT: s_cbranch_execnz .LBB15_2 +; GFX1232-FAKE16-NEXT: ; %bb.3: ; %atomicrmw.end +; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1232-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s10, v2 +; GFX1232-FAKE16-NEXT: .LBB15_4: ; %Flow +; GFX1232-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX1232-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1232-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX1232-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1232-FAKE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1232-FAKE16-NEXT: v_cndmask_b32_e64 v0, s4, 0, vcc_lo +; GFX1232-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1232-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX1232-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1232-FAKE16-NEXT: v_cndmask_b32_e64 v0, s1, 0, vcc_lo +; GFX1232-FAKE16-NEXT: s_wait_alu 0xf1ff ; GFX1232-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-FAKE16-NEXT: v_or_b32_e32 v0, s2, v0 -; GFX1232-FAKE16-NEXT: s_mov_b32 s2, -1 -; GFX1232-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], null +; GFX1232-FAKE16-NEXT: v_or_b32_e32 v0, s0, v0 +; GFX1232-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], null ; GFX1232-FAKE16-NEXT: s_endpgm %rmw = atomicrmw or ptr addrspace(1) %uniform.ptr, i16 %val monotonic, align 2 store i16 %rmw, ptr addrspace(1) %result diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll index 1311560715dd..d587c97f4ed7 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll @@ -1061,25 +1061,64 @@ define void @flat_atomic_sub_i32_noret(ptr %ptr, i32 %in) { ; GCN1-LABEL: flat_atomic_sub_i32_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB30_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v3, vcc, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB30_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB30_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_sub_u32_e32 v3, vcc, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB30_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub v[0:1], v2 +; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB30_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_sub_u32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB30_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr %ptr, i32 %in seq_cst ret void @@ -1091,9 +1130,22 @@ define void @flat_atomic_sub_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_sub v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB31_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v3, vcc, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB31_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_noret_offset: @@ -1101,17 +1153,43 @@ define void @flat_atomic_sub_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_sub v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB31_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_sub_u32_e32 v3, vcc, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB31_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB31_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_sub_u32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB31_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst @@ -1122,25 +1200,67 @@ define i32 @flat_atomic_sub_i32_ret(ptr %ptr, i32 %in) { ; GCN1-LABEL: flat_atomic_sub_i32_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB32_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: v_sub_i32_e32 v3, vcc, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB32_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB32_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: v_sub_u32_e32 v3, vcc, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB32_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB32_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_sub_u32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB32_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr %ptr, i32 %in seq_cst ret i32 %result @@ -1150,29 +1270,69 @@ define i32 @flat_atomic_sub_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-LABEL: flat_atomic_sub_i32_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB33_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v1, v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB33_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB33_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_sub_u32_e32 v0, vcc, v1, v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB33_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB33_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_sub_u32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB33_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw sub ptr %gep, i32 %in seq_cst @@ -1185,10 +1345,22 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_sub v[0:1], v2 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB34_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_subrev_i32_e32 v2, vcc, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB34_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_noret_scalar: @@ -1196,10 +1368,22 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_sub v[0:1], v2 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB34_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_subrev_u32_e32 v2, vcc, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB34_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_noret_scalar: @@ -1207,10 +1391,22 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_sub v[0:1], v2 +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB34_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_subrev_u32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB34_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr %ptr, i32 %in seq_cst ret void @@ -1224,10 +1420,22 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_sub v[0:1], v2 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB35_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_subrev_i32_e32 v2, vcc, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB35_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_noret_offset_scalar: @@ -1237,10 +1445,22 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_sub v[0:1], v2 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB35_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_subrev_u32_e32 v2, vcc, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB35_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_noret_offset_scalar: @@ -1248,10 +1468,22 @@ define amdgpu_gfx void @flat_atomic_sub_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB35_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_subrev_u32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB35_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst @@ -1264,10 +1496,24 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v1, s4 +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s5 +; GCN1-NEXT: .LBB36_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_subrev_i32_e32 v3, vcc, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB36_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_ret_scalar: @@ -1275,10 +1521,24 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v1, s4 +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s5 +; GCN2-NEXT: .LBB36_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_subrev_u32_e32 v3, vcc, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB36_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_ret_scalar: @@ -1286,10 +1546,24 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v1, s4 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; GCN3-NEXT: .LBB36_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_subrev_u32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB36_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr %ptr, i32 %in seq_cst ret i32 %result @@ -1301,12 +1575,24 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s35 +; GCN1-NEXT: flat_load_dword v0, v[1:2] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB37_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_subrev_i32_e32 v3, vcc, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB37_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_ret_offset_scalar: @@ -1314,12 +1600,24 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s35 +; GCN2-NEXT: flat_load_dword v0, v[1:2] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB37_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_subrev_u32_e32 v3, vcc, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB37_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_ret_offset_scalar: @@ -1327,10 +1625,24 @@ define amdgpu_gfx i32 @flat_atomic_sub_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; GCN3-NEXT: .LBB37_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_subrev_u32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB37_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw sub ptr %gep, i32 %in seq_cst @@ -1343,9 +1655,22 @@ define void @flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_sub v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB38_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v3, vcc, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB38_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory: @@ -1353,17 +1678,43 @@ define void @flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_sub v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB38_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_sub_u32_e32 v3, vcc, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB38_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB38_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_sub_u32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB38_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -1374,29 +1725,69 @@ define i32 @flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3 ; GCN1-LABEL: flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB39_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_sub_i32_e32 v0, vcc, v1, v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB39_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB39_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_sub_u32_e32 v0, vcc, v1, v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB39_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_sub v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB39_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_sub_u32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB39_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -1411,25 +1802,64 @@ define void @flat_atomic_and_i32_noret(ptr %ptr, i32 %in) { ; GCN1-LABEL: flat_atomic_and_i32_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB40_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB40_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB40_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB40_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and v[0:1], v2 +; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB40_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB40_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr %ptr, i32 %in seq_cst ret void @@ -1441,9 +1871,22 @@ define void @flat_atomic_and_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_and v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB41_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB41_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_noret_offset: @@ -1451,17 +1894,43 @@ define void @flat_atomic_and_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_and v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB41_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB41_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB41_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB41_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst @@ -1472,25 +1941,67 @@ define i32 @flat_atomic_and_i32_ret(ptr %ptr, i32 %in) { ; GCN1-LABEL: flat_atomic_and_i32_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB42_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB42_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB42_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB42_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB42_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB42_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr %ptr, i32 %in seq_cst ret i32 %result @@ -1500,29 +2011,69 @@ define i32 @flat_atomic_and_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-LABEL: flat_atomic_and_i32_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB43_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_and_b32_e32 v0, v1, v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB43_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB43_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_and_b32_e32 v0, v1, v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB43_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB43_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB43_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw and ptr %gep, i32 %in seq_cst @@ -1535,10 +2086,22 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_and v[0:1], v2 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB44_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB44_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_noret_scalar: @@ -1546,10 +2109,22 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_and v[0:1], v2 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB44_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB44_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_noret_scalar: @@ -1557,10 +2132,22 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_and v[0:1], v2 +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB44_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB44_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr %ptr, i32 %in seq_cst ret void @@ -1574,10 +2161,22 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_and v[0:1], v2 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB45_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB45_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_noret_offset_scalar: @@ -1587,10 +2186,22 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_and v[0:1], v2 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB45_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB45_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_noret_offset_scalar: @@ -1598,10 +2209,22 @@ define amdgpu_gfx void @flat_atomic_and_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB45_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB45_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst @@ -1614,10 +2237,24 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v1, s4 +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s5 +; GCN1-NEXT: .LBB46_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_and_b32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB46_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_ret_scalar: @@ -1625,10 +2262,24 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v1, s4 +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s5 +; GCN2-NEXT: .LBB46_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_and_b32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB46_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_ret_scalar: @@ -1636,10 +2287,24 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v1, s4 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; GCN3-NEXT: .LBB46_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_and_b32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB46_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr %ptr, i32 %in seq_cst ret i32 %result @@ -1651,12 +2316,24 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s35 +; GCN1-NEXT: flat_load_dword v0, v[1:2] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB47_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_and_b32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB47_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_ret_offset_scalar: @@ -1664,12 +2341,24 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s35 +; GCN2-NEXT: flat_load_dword v0, v[1:2] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB47_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_and_b32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB47_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_ret_offset_scalar: @@ -1677,10 +2366,24 @@ define amdgpu_gfx i32 @flat_atomic_and_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; GCN3-NEXT: .LBB47_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_and_b32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB47_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw and ptr %gep, i32 %in seq_cst @@ -1693,9 +2396,22 @@ define void @flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_and v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB48_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB48_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory: @@ -1703,17 +2419,43 @@ define void @flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_and v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB48_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB48_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB48_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB48_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -1724,29 +2466,69 @@ define i32 @flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3 ; GCN1-LABEL: flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB49_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_and_b32_e32 v0, v1, v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB49_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB49_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_and_b32_e32 v0, v1, v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB49_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_and_i32_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_and v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB49_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_and_b32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB49_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -2532,25 +3314,64 @@ define void @flat_atomic_or_i32_noret(ptr %ptr, i32 %in) { ; GCN1-LABEL: flat_atomic_or_i32_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB60_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB60_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB60_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB60_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or v[0:1], v2 +; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB60_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB60_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr %ptr, i32 %in seq_cst ret void @@ -2562,9 +3383,22 @@ define void @flat_atomic_or_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_or v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB61_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB61_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_noret_offset: @@ -2572,17 +3406,43 @@ define void @flat_atomic_or_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_or v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB61_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB61_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB61_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB61_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst @@ -2593,25 +3453,67 @@ define i32 @flat_atomic_or_i32_ret(ptr %ptr, i32 %in) { ; GCN1-LABEL: flat_atomic_or_i32_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB62_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB62_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB62_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB62_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB62_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB62_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr %ptr, i32 %in seq_cst ret i32 %result @@ -2621,29 +3523,69 @@ define i32 @flat_atomic_or_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-LABEL: flat_atomic_or_i32_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB63_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_or_b32_e32 v0, v1, v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB63_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB63_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_or_b32_e32 v0, v1, v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB63_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB63_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB63_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw or ptr %gep, i32 %in seq_cst @@ -2656,10 +3598,22 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_scalar(ptr inreg %ptr, i32 inre ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_or v[0:1], v2 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB64_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB64_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_noret_scalar: @@ -2667,10 +3621,22 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_scalar(ptr inreg %ptr, i32 inre ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_or v[0:1], v2 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB64_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB64_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_noret_scalar: @@ -2678,10 +3644,22 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_scalar(ptr inreg %ptr, i32 inre ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_or v[0:1], v2 +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB64_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB64_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr %ptr, i32 %in seq_cst ret void @@ -2695,10 +3673,22 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_offset_scalar(ptr inreg %out, i ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_or v[0:1], v2 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB65_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB65_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_noret_offset_scalar: @@ -2708,10 +3698,22 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_offset_scalar(ptr inreg %out, i ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_or v[0:1], v2 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB65_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB65_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_noret_offset_scalar: @@ -2719,10 +3721,22 @@ define amdgpu_gfx void @flat_atomic_or_i32_noret_offset_scalar(ptr inreg %out, i ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB65_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB65_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst @@ -2735,10 +3749,24 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg % ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v1, s4 +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s5 +; GCN1-NEXT: .LBB66_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_or_b32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB66_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_ret_scalar: @@ -2746,10 +3774,24 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg % ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v1, s4 +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s5 +; GCN2-NEXT: .LBB66_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_or_b32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB66_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_ret_scalar: @@ -2757,10 +3799,24 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_scalar(ptr inreg %ptr, i32 inreg % ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v1, s4 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; GCN3-NEXT: .LBB66_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_or_b32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB66_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr %ptr, i32 %in seq_cst ret i32 %result @@ -2772,12 +3828,24 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s35 +; GCN1-NEXT: flat_load_dword v0, v[1:2] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB67_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_or_b32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB67_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_ret_offset_scalar: @@ -2785,12 +3853,24 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s35 +; GCN2-NEXT: flat_load_dword v0, v[1:2] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB67_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_or_b32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB67_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_ret_offset_scalar: @@ -2798,10 +3878,24 @@ define amdgpu_gfx i32 @flat_atomic_or_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; GCN3-NEXT: .LBB67_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_or_b32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB67_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw or ptr %gep, i32 %in seq_cst @@ -2814,9 +3908,22 @@ define void @flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_or v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB68_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB68_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory: @@ -2824,17 +3931,43 @@ define void @flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_or v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB68_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB68_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB68_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB68_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %tmp0 = atomicrmw or ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -2845,29 +3978,69 @@ define i32 @flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i32 ; GCN1-LABEL: flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB69_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_or_b32_e32 v0, v1, v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB69_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB69_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_or_b32_e32 v0, v1, v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB69_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_or_i32_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_or v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB69_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_or_b32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB69_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -2882,25 +4055,64 @@ define void @flat_atomic_xor_i32_noret(ptr %ptr, i32 %in) { ; GCN1-LABEL: flat_atomic_xor_i32_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB70_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB70_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB70_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB70_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v[0:1], v2 +; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB70_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB70_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr %ptr, i32 %in seq_cst ret void @@ -2912,9 +4124,22 @@ define void @flat_atomic_xor_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_xor v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB71_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB71_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_noret_offset: @@ -2922,17 +4147,43 @@ define void @flat_atomic_xor_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB71_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB71_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB71_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB71_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst @@ -2943,25 +4194,67 @@ define i32 @flat_atomic_xor_i32_ret(ptr %ptr, i32 %in) { ; GCN1-LABEL: flat_atomic_xor_i32_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB72_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB72_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB72_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB72_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB72_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB72_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr %ptr, i32 %in seq_cst ret i32 %result @@ -2971,29 +4264,69 @@ define i32 @flat_atomic_xor_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-LABEL: flat_atomic_xor_i32_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB73_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_xor_b32_e32 v0, v1, v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB73_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB73_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_xor_b32_e32 v0, v1, v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB73_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB73_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB73_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw xor ptr %gep, i32 %in seq_cst @@ -3006,10 +4339,22 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_xor v[0:1], v2 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB74_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB74_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_noret_scalar: @@ -3017,10 +4362,22 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB74_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB74_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_noret_scalar: @@ -3028,10 +4385,22 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_scalar(ptr inreg %ptr, i32 inr ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_xor v[0:1], v2 +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB74_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB74_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr %ptr, i32 %in seq_cst ret void @@ -3045,10 +4414,22 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_xor v[0:1], v2 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB75_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v2, s6, v3 +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB75_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_noret_offset_scalar: @@ -3058,10 +4439,22 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB75_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v2, s6, v3 +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB75_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_noret_offset_scalar: @@ -3069,10 +4462,22 @@ define amdgpu_gfx void @flat_atomic_xor_i32_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB75_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v2, s6, v3 +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB75_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst @@ -3085,10 +4490,24 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v1, s4 +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s5 +; GCN1-NEXT: .LBB76_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_xor_b32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB76_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_ret_scalar: @@ -3096,10 +4515,24 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v1, s4 +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s5 +; GCN2-NEXT: .LBB76_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_xor_b32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB76_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_ret_scalar: @@ -3107,10 +4540,24 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_scalar(ptr inreg %ptr, i32 inreg ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v1, s4 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; GCN3-NEXT: .LBB76_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_xor_b32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB76_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr %ptr, i32 %in seq_cst ret i32 %result @@ -3122,12 +4569,24 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s35 +; GCN1-NEXT: flat_load_dword v0, v[1:2] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB77_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_xor_b32_e32 v3, s6, v4 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB77_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_ret_offset_scalar: @@ -3135,12 +4594,24 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s35 +; GCN2-NEXT: flat_load_dword v0, v[1:2] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB77_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_xor_b32_e32 v3, s6, v4 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB77_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_ret_offset_scalar: @@ -3148,10 +4619,24 @@ define amdgpu_gfx i32 @flat_atomic_xor_i32_ret_offset_scalar(ptr inreg %out, i32 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; GCN3-NEXT: .LBB77_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_xor_b32_e32 v3, s6, v4 +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB77_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw xor ptr %gep, i32 %in seq_cst @@ -3164,9 +4649,22 @@ define void @flat_xor_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_xor v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB78_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB78_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_xor_i32_noret_offset__amdgpu_no_remote_memory: @@ -3174,17 +4672,43 @@ define void @flat_xor_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, i32 %i ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_xor v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB78_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB78_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_xor_i32_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB78_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB78_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -3195,29 +4719,69 @@ define i32 @flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory(ptr %out, i3 ; GCN1-LABEL: flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB79_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_xor_b32_e32 v0, v1, v2 +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB79_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB79_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_xor_b32_e32 v0, v1, v2 +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB79_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_xor v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB79_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_xor_b32_e32 v3, v4, v2 +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB79_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -7299,25 +8863,70 @@ define void @flat_atomic_uinc_wrap_i32_noret(ptr %ptr, i32 %in) { ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB131_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 1, v4 +; GCN1-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB131_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB131_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 1, v4 +; GCN2-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB131_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc v[0:1], v2 +; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB131_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_u32_e32 v3, 1, v4 +; GCN3-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB131_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst ret void @@ -7329,9 +8938,24 @@ define void @flat_atomic_uinc_wrap_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_inc v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB132_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 1, v4 +; GCN1-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB132_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_offset: @@ -7339,17 +8963,47 @@ define void @flat_atomic_uinc_wrap_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_inc v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB132_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 1, v4 +; GCN2-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB132_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB132_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_u32_e32 v3, 1, v4 +; GCN3-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB132_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst @@ -7360,25 +9014,73 @@ define i32 @flat_atomic_uinc_wrap_i32_ret(ptr %ptr, i32 %in) { ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB133_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 1, v4 +; GCN1-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB133_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB133_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 1, v4 +; GCN2-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB133_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB133_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_add_u32_e32 v3, 1, v4 +; GCN3-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB133_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst ret i32 %result @@ -7388,29 +9090,75 @@ define i32 @flat_atomic_uinc_wrap_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB134_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; GCN1-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB134_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB134_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; GCN2-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB134_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB134_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_add_u32_e32 v3, 1, v4 +; GCN3-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB134_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst @@ -7423,10 +9171,24 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_inc v[0:1], v2 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB135_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v2, vcc, 1, v3 +; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3 +; GCN1-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB135_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_scalar: @@ -7434,10 +9196,24 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_inc v[0:1], v2 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB135_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v2, vcc, 1, v3 +; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3 +; GCN2-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB135_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_scalar: @@ -7445,10 +9221,24 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_inc v[0:1], v2 +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB135_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_u32_e32 v2, 1, v3 +; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3 +; GCN3-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB135_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst ret void @@ -7462,10 +9252,24 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_offset_scalar(ptr inreg ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_inc v[0:1], v2 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB136_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v2, vcc, 1, v3 +; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3 +; GCN1-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB136_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_noret_offset_scalar: @@ -7475,10 +9279,24 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_offset_scalar(ptr inreg ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_inc v[0:1], v2 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB136_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v2, vcc, 1, v3 +; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3 +; GCN2-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB136_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_noret_offset_scalar: @@ -7486,10 +9304,24 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i32_noret_offset_scalar(ptr inreg ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB136_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_u32_e32 v2, 1, v3 +; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3 +; GCN3-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB136_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst @@ -7502,10 +9334,26 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v1, s4 +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v2, s5 +; GCN1-NEXT: .LBB137_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v4 +; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4 +; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB137_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_scalar: @@ -7513,10 +9361,26 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v1, s4 +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v2, s5 +; GCN2-NEXT: .LBB137_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v4 +; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4 +; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB137_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_scalar: @@ -7524,10 +9388,26 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v1, s4 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; GCN3-NEXT: .LBB137_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_add_u32_e32 v0, 1, v4 +; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4 +; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB137_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr %ptr, i32 %in seq_cst ret i32 %result @@ -7539,12 +9419,26 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s35 +; GCN1-NEXT: flat_load_dword v0, v[1:2] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB138_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v4 +; GCN1-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4 +; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB138_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset_scalar: @@ -7552,12 +9446,26 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s35 +; GCN2-NEXT: flat_load_dword v0, v[1:2] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB138_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v4 +; GCN2-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4 +; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB138_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset_scalar: @@ -7565,10 +9473,26 @@ define amdgpu_gfx i32 @flat_atomic_uinc_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; GCN3-NEXT: .LBB138_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v0 +; GCN3-NEXT: v_add_u32_e32 v0, 1, v4 +; GCN3-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4 +; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB138_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst @@ -7581,9 +9505,24 @@ define void @flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_inc v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB139_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 1, v4 +; GCN1-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB139_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory: @@ -7591,17 +9530,47 @@ define void @flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_inc v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB139_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 1, v4 +; GCN2-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB139_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB139_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_u32_e32 v3, 1, v4 +; GCN3-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB139_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -7612,29 +9581,75 @@ define i32 @flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN1-LABEL: flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[4:5], 0 +; GCN1-NEXT: .LBB140_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; GCN1-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN1-NEXT: s_cbranch_execnz .LBB140_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[4:5], 0 +; GCN2-NEXT: .LBB140_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; GCN2-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN2-NEXT: s_cbranch_execnz .LBB140_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_inc v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[4:5], 0 +; GCN3-NEXT: .LBB140_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_add_u32_e32 v3, 1, v4 +; GCN3-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GCN3-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GCN3-NEXT: s_cbranch_execnz .LBB140_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %result = atomicrmw uinc_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -7649,25 +9664,76 @@ define void @flat_atomic_udec_wrap_i32_noret(ptr %ptr, i32 %in) { ; GCN1-LABEL: flat_atomic_udec_wrap_i32_noret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB141_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB141_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB141_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB141_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec v[0:1], v2 +; GCN3-NEXT: flat_load_dword v4, v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB141_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN3-NEXT: v_add_u32_e32 v3, -1, v4 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB141_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst ret void @@ -7679,9 +9745,26 @@ define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_dec v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB142_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB142_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_offset: @@ -7689,17 +9772,51 @@ define void @flat_atomic_udec_wrap_i32_noret_offset(ptr %out, i32 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_dec v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB142_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB142_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB142_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN3-NEXT: v_add_u32_e32 v3, -1, v4 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB142_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst @@ -7710,25 +9827,79 @@ define i32 @flat_atomic_udec_wrap_i32_ret(ptr %ptr, i32 %in) { ; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB143_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB143_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v0, v3 ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB143_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB143_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v0, v3 ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB143_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN3-NEXT: v_add_u32_e32 v3, -1, v4 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB143_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst ret i32 %result @@ -7738,29 +9909,81 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset(ptr %out, i32 %in) { ; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_offset: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB144_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB144_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB144_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB144_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB144_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN3-NEXT: v_add_u32_e32 v3, -1, v4 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB144_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst @@ -7773,10 +9996,27 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_dec v[0:1], v2 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s6 +; GCN1-NEXT: .LBB145_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v2, vcc, -1, v3 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB145_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_scalar: @@ -7784,10 +10024,27 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_dec v[0:1], v2 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s6 +; GCN2-NEXT: .LBB145_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v2, vcc, -1, v3 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB145_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_scalar: @@ -7795,10 +10052,27 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_dec v[0:1], v2 +; GCN3-NEXT: flat_load_dword v3, v[0:1] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: .LBB145_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 +; GCN3-NEXT: v_add_u32_e32 v2, -1, v3 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB145_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst ret void @@ -7812,10 +10086,27 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg ; GCN1-NEXT: s_addc_u32 s35, s5, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_dec v[0:1], v2 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s6 +; GCN1-NEXT: .LBB146_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v2, vcc, -1, v3 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v3, v2 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB146_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar: @@ -7825,10 +10116,27 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg ; GCN2-NEXT: s_addc_u32 s35, s5, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_dec v[0:1], v2 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s6 +; GCN2-NEXT: .LBB146_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v2, vcc, -1, v3 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v3, v2 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB146_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_noret_offset_scalar: @@ -7836,10 +10144,27 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i32_noret_offset_scalar(ptr inreg ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s6 +; GCN3-NEXT: .LBB146_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 +; GCN3-NEXT: v_add_u32_e32 v2, -1, v3 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: v_mov_b32_e32 v3, v2 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB146_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst @@ -7852,10 +10177,29 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_mov_b32_e32 v0, s4 ; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN1-NEXT: flat_load_dword v0, v[0:1] +; GCN1-NEXT: v_mov_b32_e32 v1, s4 +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v3, s6 +; GCN1-NEXT: v_mov_b32_e32 v2, s5 +; GCN1-NEXT: .LBB147_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v5, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v5 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB147_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_scalar: @@ -7863,10 +10207,29 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_mov_b32_e32 v0, s4 ; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN2-NEXT: flat_load_dword v0, v[0:1] +; GCN2-NEXT: v_mov_b32_e32 v1, s4 +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v3, s6 +; GCN2-NEXT: v_mov_b32_e32 v2, s5 +; GCN2-NEXT: .LBB147_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v5, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v5 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB147_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_scalar: @@ -7874,10 +10237,29 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_scalar(ptr inreg %ptr, i32 ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN3-NEXT: flat_load_dword v0, v[0:1] +; GCN3-NEXT: v_mov_b32_e32 v1, s4 +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: v_mov_b32_e32 v3, s6 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; GCN3-NEXT: .LBB147_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v5, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 +; GCN3-NEXT: v_add_u32_e32 v0, -1, v5 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB147_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr %ptr, i32 %in seq_cst ret i32 %result @@ -7889,12 +10271,29 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_add_u32 s34, s4, 16 ; GCN1-NEXT: s_addc_u32 s35, s5, 0 -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN1-NEXT: v_mov_b32_e32 v1, s34 +; GCN1-NEXT: v_mov_b32_e32 v2, s35 +; GCN1-NEXT: flat_load_dword v0, v[1:2] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v3, s6 +; GCN1-NEXT: .LBB148_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v5, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v5 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GCN1-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB148_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar: @@ -7902,12 +10301,29 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_add_u32 s34, s4, 16 ; GCN2-NEXT: s_addc_u32 s35, s5, 0 -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN2-NEXT: v_mov_b32_e32 v1, s34 +; GCN2-NEXT: v_mov_b32_e32 v2, s35 +; GCN2-NEXT: flat_load_dword v0, v[1:2] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v3, s6 +; GCN2-NEXT: .LBB148_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v5, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v5 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GCN2-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB148_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset_scalar: @@ -7915,10 +10331,29 @@ define amdgpu_gfx i32 @flat_atomic_udec_wrap_i32_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN3-NEXT: v_mov_b32_e32 v0, s4 ; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v0, v[0:1] offset:16 +; GCN3-NEXT: v_mov_b32_e32 v1, s4 +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: v_mov_b32_e32 v3, s6 +; GCN3-NEXT: v_mov_b32_e32 v2, s5 +; GCN3-NEXT: .LBB148_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v5, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; GCN3-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 +; GCN3-NEXT: v_add_u32_e32 v0, -1, v5 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc +; GCN3-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB148_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i32 4 %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst @@ -7931,9 +10366,26 @@ define void @flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 ; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_dec v[0:1], v2 +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB149_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v3, vcc, -1, v4 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v4, v3 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB149_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory: @@ -7941,17 +10393,51 @@ define void @flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_dec v[0:1], v2 +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB149_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v3, vcc, -1, v4 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v4, v3 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB149_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16 +; GCN3-NEXT: flat_load_dword v4, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB149_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN3-NEXT: v_add_u32_e32 v3, -1, v4 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB149_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -7962,29 +10448,81 @@ define i32 @flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN1-LABEL: flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory: ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN1-NEXT: v_add_i32_e32 v0, vcc, 16, v0 -; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN1-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN1-NEXT: v_add_i32_e32 v3, vcc, 16, v0 +; GCN1-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v0, v[3:4] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB150_1: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v1, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, -1, v1 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN1-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2 +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB150_1 +; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; ; GCN2-LABEL: flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory: ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN2-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; GCN2-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; GCN2-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v0, v[3:4] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB150_1: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v1, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, -1, v1 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN2-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2 +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB150_1 +; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: flat_atomic_dec v0, v[0:1], v2 offset:16 glc +; GCN3-NEXT: flat_load_dword v3, v[0:1] offset:16 +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB150_1: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v4, v3 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GCN3-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GCN3-NEXT: v_add_u32_e32 v3, -1, v4 +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:16 glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB150_1 +; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v0, v3 ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr %out, i64 4 %result = atomicrmw udec_wrap ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll index 23dfe2f70fa7..6bff8f558af7 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -3633,21 +3633,40 @@ define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB30_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB30_4 +; GCN1-NEXT: s_cbranch_execnz .LBB30_6 ; GCN1-NEXT: .LBB30_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB30_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB30_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; GCN1-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB30_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: ; implicit-def: $vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB30_2 -; GCN1-NEXT: .LBB30_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB30_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen @@ -3673,21 +3692,40 @@ define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB30_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB30_4 +; GCN2-NEXT: s_cbranch_execnz .LBB30_6 ; GCN2-NEXT: .LBB30_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB30_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB30_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; GCN2-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB30_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: ; implicit-def: $vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB30_2 -; GCN2-NEXT: .LBB30_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB30_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen @@ -3711,21 +3749,37 @@ define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB30_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB30_4 +; GCN3-NEXT: s_cbranch_execnz .LBB30_6 ; GCN3-NEXT: .LBB30_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB30_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB30_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GCN3-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB30_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: ; implicit-def: $vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB30_2 -; GCN3-NEXT: .LBB30_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB30_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen @@ -3756,21 +3810,40 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB31_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB31_4 +; GCN1-NEXT: s_cbranch_execnz .LBB31_6 ; GCN1-NEXT: .LBB31_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB31_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB31_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; GCN1-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB31_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: ; implicit-def: $vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB31_2 -; GCN1-NEXT: .LBB31_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB31_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen @@ -3798,21 +3871,40 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB31_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB31_4 +; GCN2-NEXT: s_cbranch_execnz .LBB31_6 ; GCN2-NEXT: .LBB31_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB31_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB31_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; GCN2-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB31_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: ; implicit-def: $vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB31_2 -; GCN2-NEXT: .LBB31_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB31_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen @@ -3838,21 +3930,37 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB31_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB31_4 +; GCN3-NEXT: s_cbranch_execnz .LBB31_6 ; GCN3-NEXT: .LBB31_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB31_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB31_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GCN3-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB31_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: ; implicit-def: $vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB31_2 -; GCN3-NEXT: .LBB31_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB31_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen @@ -3877,41 +3985,56 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN1-NEXT: v_mov_b32_e32 v5, v1 -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB32_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB32_4 -; GCN1-NEXT: .LBB32_2: ; %atomicrmw.phi -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB32_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: s_cbranch_execz .LBB32_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v5, v[4:5] +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB32_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; GCN1-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB32_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: .LBB32_4: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB32_2 -; GCN1-NEXT: .LBB32_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 -; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_cbranch_execz .LBB32_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v0, v2 +; GCN1-NEXT: v_sub_i32_e32 v2, vcc, v4, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc -; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: .LBB32_6: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 +; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -3920,41 +4043,56 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN2-NEXT: v_mov_b32_e32 v5, v1 -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB32_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB32_4 -; GCN2-NEXT: .LBB32_2: ; %atomicrmw.phi -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB32_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: s_cbranch_execz .LBB32_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v5, v[4:5] +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB32_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; GCN2-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB32_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: .LBB32_4: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB32_2 -; GCN2-NEXT: .LBB32_4: ; %atomicrmw.private -; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_cbranch_execz .LBB32_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_sub_u32_e32 v2, vcc, v0, v2 +; GCN2-NEXT: v_sub_u32_e32 v2, vcc, v4, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc -; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: .LBB32_6: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 +; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -3969,21 +4107,37 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB32_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB32_4 +; GCN3-NEXT: s_cbranch_execnz .LBB32_6 ; GCN3-NEXT: .LBB32_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB32_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB32_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_sub_co_u32_e32 v6, vcc, v8, v2 +; GCN3-NEXT: v_subb_co_u32_e32 v7, vcc, v9, v3, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB32_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: ; implicit-def: $vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB32_2 -; GCN3-NEXT: .LBB32_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB32_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -4015,21 +4169,40 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB33_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB33_4 +; GCN1-NEXT: s_cbranch_execnz .LBB33_6 ; GCN1-NEXT: .LBB33_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB33_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB33_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_sub_i32_e32 v6, vcc, v8, v2 +; GCN1-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB33_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: ; implicit-def: $vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB33_2 -; GCN1-NEXT: .LBB33_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB33_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -4058,21 +4231,40 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB33_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB33_4 +; GCN2-NEXT: s_cbranch_execnz .LBB33_6 ; GCN2-NEXT: .LBB33_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB33_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB33_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_sub_u32_e32 v6, vcc, v8, v2 +; GCN2-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB33_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: ; implicit-def: $vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB33_2 -; GCN2-NEXT: .LBB33_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB33_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -4099,21 +4291,37 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB33_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB33_4 +; GCN3-NEXT: s_cbranch_execnz .LBB33_6 ; GCN3-NEXT: .LBB33_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB33_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB33_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_sub_co_u32_e32 v6, vcc, v8, v2 +; GCN3-NEXT: v_subb_co_u32_e32 v7, vcc, v9, v3, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB33_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: ; implicit-def: $vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB33_2 -; GCN3-NEXT: .LBB33_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB33_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -4144,21 +4352,40 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN1-NEXT: s_mov_b64 s[34:35], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB34_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB34_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccnz .LBB34_6 ; GCN1-NEXT: .LBB34_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB34_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s7 +; GCN1-NEXT: .LBB34_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 +; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB34_2 -; GCN1-NEXT: .LBB34_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB34_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB34_2 +; GCN1-NEXT: .LBB34_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec @@ -4188,21 +4415,40 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN2-NEXT: s_mov_b64 s[34:35], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB34_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB34_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccnz .LBB34_6 ; GCN2-NEXT: .LBB34_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB34_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: .LBB34_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB34_2 -; GCN2-NEXT: .LBB34_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB34_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB34_2 +; GCN2-NEXT: .LBB34_6: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 @@ -4229,21 +4475,35 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_mov_b64 s[34:35], -1 ; GCN3-NEXT: s_cbranch_vccnz .LBB34_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB34_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccnz .LBB34_6 ; GCN3-NEXT: .LBB34_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB34_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s7 +; GCN3-NEXT: .LBB34_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v6, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB34_2 -; GCN3-NEXT: .LBB34_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB34_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB34_2 +; GCN3-NEXT: .LBB34_6: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v0, s34 @@ -4276,21 +4536,40 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN1-NEXT: s_mov_b64 s[36:37], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB35_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB35_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccnz .LBB35_6 ; GCN1-NEXT: .LBB35_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB35_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s7 +; GCN1-NEXT: .LBB35_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 +; GCN1-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB35_2 -; GCN1-NEXT: .LBB35_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB35_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB35_2 +; GCN1-NEXT: .LBB35_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec @@ -4322,21 +4601,40 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN2-NEXT: s_mov_b64 s[36:37], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB35_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB35_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccnz .LBB35_6 ; GCN2-NEXT: .LBB35_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB35_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: .LBB35_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; GCN2-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB35_2 -; GCN2-NEXT: .LBB35_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB35_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB35_2 +; GCN2-NEXT: .LBB35_6: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 @@ -4365,21 +4663,35 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_mov_b64 s[36:37], -1 ; GCN3-NEXT: s_cbranch_vccnz .LBB35_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB35_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccnz .LBB35_6 ; GCN3-NEXT: .LBB35_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB35_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s7 +; GCN3-NEXT: .LBB35_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GCN3-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v6, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB35_2 -; GCN3-NEXT: .LBB35_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB35_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB35_2 +; GCN3-NEXT: .LBB35_6: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1 ; GCN3-NEXT: v_mov_b32_e32 v0, s34 @@ -4409,20 +4721,39 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: s_cmp_eq_u32 s5, s34 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB36_2 +; GCN1-NEXT: s_cbranch_vccz .LBB36_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: .LBB36_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v8, v1 +; GCN1-NEXT: v_mov_b32_e32 v7, v0 +; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7 +; GCN1-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB36_3 -; GCN1-NEXT: s_branch .LBB36_4 -; GCN1-NEXT: .LBB36_2: +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB36_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB36_6 +; GCN1-NEXT: .LBB36_4: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB36_3: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB36_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec @@ -4438,7 +4769,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB36_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB36_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -4451,20 +4782,39 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: s_cmp_eq_u32 s5, s34 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB36_2 +; GCN2-NEXT: s_cbranch_vccz .LBB36_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: .LBB36_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v8, v1 +; GCN2-NEXT: v_mov_b32_e32 v7, v0 +; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 +; GCN2-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB36_3 -; GCN2-NEXT: s_branch .LBB36_4 -; GCN2-NEXT: .LBB36_2: +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB36_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB36_6 +; GCN2-NEXT: .LBB36_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB36_3: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB36_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -4479,7 +4829,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB36_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB36_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -4490,20 +4840,34 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_cmp_eq_u32 s5, s35 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB36_2 +; GCN3-NEXT: s_cbranch_vccz .LBB36_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s7 +; GCN3-NEXT: .LBB36_2: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v8, v1 +; GCN3-NEXT: v_mov_b32_e32 v7, v0 +; GCN3-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7 +; GCN3-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v4, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB36_3 -; GCN3-NEXT: s_branch .LBB36_4 -; GCN3-NEXT: .LBB36_2: +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB36_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB36_6 +; GCN3-NEXT: .LBB36_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB36_3: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB36_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -4516,7 +4880,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB36_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB36_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr %ptr, i64 %in seq_cst @@ -4535,20 +4899,39 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB37_2 +; GCN1-NEXT: s_cbranch_vccz .LBB37_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: .LBB37_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v8, v1 +; GCN1-NEXT: v_mov_b32_e32 v7, v0 +; GCN1-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7 +; GCN1-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB37_3 -; GCN1-NEXT: s_branch .LBB37_4 -; GCN1-NEXT: .LBB37_2: +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB37_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB37_6 +; GCN1-NEXT: .LBB37_4: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB37_3: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB37_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec @@ -4564,7 +4947,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB37_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB37_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -4579,20 +4962,39 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB37_2 +; GCN2-NEXT: s_cbranch_vccz .LBB37_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: .LBB37_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v8, v1 +; GCN2-NEXT: v_mov_b32_e32 v7, v0 +; GCN2-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 +; GCN2-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB37_3 -; GCN2-NEXT: s_branch .LBB37_4 -; GCN2-NEXT: .LBB37_2: +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB37_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB37_6 +; GCN2-NEXT: .LBB37_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB37_3: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB37_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -4607,7 +5009,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: v_subb_u32_e32 v4, vcc, v1, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB37_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB37_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -4620,20 +5022,34 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB37_2 +; GCN3-NEXT: s_cbranch_vccz .LBB37_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: v_mov_b32_e32 v3, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s7 +; GCN3-NEXT: .LBB37_2: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v8, v1 +; GCN3-NEXT: v_mov_b32_e32 v7, v0 +; GCN3-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7 +; GCN3-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v4, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB37_3 -; GCN3-NEXT: s_branch .LBB37_4 -; GCN3-NEXT: .LBB37_2: +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB37_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB37_6 +; GCN3-NEXT: .LBB37_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB37_3: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB37_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -4646,7 +5062,7 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB37_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB37_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 @@ -4667,21 +5083,40 @@ define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB38_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB38_4 +; GCN1-NEXT: s_cbranch_execnz .LBB38_6 ; GCN1-NEXT: .LBB38_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB38_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB38_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; GCN1-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB38_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: ; implicit-def: $vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB38_2 -; GCN1-NEXT: .LBB38_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB38_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen @@ -4709,21 +5144,40 @@ define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB38_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB38_4 +; GCN2-NEXT: s_cbranch_execnz .LBB38_6 ; GCN2-NEXT: .LBB38_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB38_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB38_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; GCN2-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB38_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: ; implicit-def: $vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB38_2 -; GCN2-NEXT: .LBB38_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB38_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen @@ -4749,21 +5203,37 @@ define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB38_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB38_4 +; GCN3-NEXT: s_cbranch_execnz .LBB38_6 ; GCN3-NEXT: .LBB38_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB38_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB38_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GCN3-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB38_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: ; implicit-def: $vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB38_2 -; GCN3-NEXT: .LBB38_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB38_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen @@ -4796,21 +5266,40 @@ define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB39_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB39_4 +; GCN1-NEXT: s_cbranch_execnz .LBB39_6 ; GCN1-NEXT: .LBB39_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB39_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB39_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_sub_i32_e32 v6, vcc, v8, v2 +; GCN1-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB39_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: ; implicit-def: $vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB39_2 -; GCN1-NEXT: .LBB39_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB39_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -4839,21 +5328,40 @@ define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB39_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB39_4 +; GCN2-NEXT: s_cbranch_execnz .LBB39_6 ; GCN2-NEXT: .LBB39_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB39_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB39_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_sub_u32_e32 v6, vcc, v8, v2 +; GCN2-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB39_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: ; implicit-def: $vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB39_2 -; GCN2-NEXT: .LBB39_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB39_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -4880,21 +5388,37 @@ define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB39_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB39_4 +; GCN3-NEXT: s_cbranch_execnz .LBB39_6 ; GCN3-NEXT: .LBB39_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB39_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB39_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_sub_co_u32_e32 v6, vcc, v8, v2 +; GCN3-NEXT: v_subb_co_u32_e32 v7, vcc, v9, v3, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB39_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr2 +; GCN3-NEXT: ; implicit-def: $vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB39_2 -; GCN3-NEXT: .LBB39_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB39_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -4928,21 +5452,40 @@ define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB40_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB40_4 +; GCN1-NEXT: s_cbranch_execnz .LBB40_6 ; GCN1-NEXT: .LBB40_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB40_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB40_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB40_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB40_2 -; GCN1-NEXT: .LBB40_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB40_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 @@ -4968,21 +5511,40 @@ define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB40_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB40_4 +; GCN2-NEXT: s_cbranch_execnz .LBB40_6 ; GCN2-NEXT: .LBB40_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB40_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB40_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB40_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB40_2 -; GCN2-NEXT: .LBB40_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB40_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 @@ -5006,21 +5568,37 @@ define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB40_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB40_4 +; GCN3-NEXT: s_cbranch_execnz .LBB40_6 ; GCN3-NEXT: .LBB40_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB40_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB40_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB40_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB40_2 -; GCN3-NEXT: .LBB40_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB40_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 @@ -5051,21 +5629,40 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB41_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB41_4 +; GCN1-NEXT: s_cbranch_execnz .LBB41_6 ; GCN1-NEXT: .LBB41_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB41_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB41_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB41_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB41_2 -; GCN1-NEXT: .LBB41_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB41_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 @@ -5093,21 +5690,40 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB41_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB41_4 +; GCN2-NEXT: s_cbranch_execnz .LBB41_6 ; GCN2-NEXT: .LBB41_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB41_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB41_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB41_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB41_2 -; GCN2-NEXT: .LBB41_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB41_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 @@ -5133,21 +5749,37 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB41_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB41_4 +; GCN3-NEXT: s_cbranch_execnz .LBB41_6 ; GCN3-NEXT: .LBB41_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB41_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB41_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB41_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB41_2 -; GCN3-NEXT: .LBB41_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB41_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 @@ -5172,41 +5804,56 @@ define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN1-NEXT: v_mov_b32_e32 v5, v1 -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB42_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB42_4 -; GCN1-NEXT: .LBB42_2: ; %atomicrmw.phi -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB42_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: s_cbranch_execz .LBB42_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v5, v[4:5] +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB42_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB42_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: .LBB42_4: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB42_2 -; GCN1-NEXT: .LBB42_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 -; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_cbranch_execz .LBB42_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN1-NEXT: v_and_b32_e32 v2, v4, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_and_b32_e32 v3, v1, v3 -; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: .LBB42_6: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 +; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -5215,41 +5862,56 @@ define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN2-NEXT: v_mov_b32_e32 v5, v1 -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB42_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB42_4 -; GCN2-NEXT: .LBB42_2: ; %atomicrmw.phi -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB42_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: s_cbranch_execz .LBB42_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v5, v[4:5] +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB42_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB42_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: .LBB42_4: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB42_2 -; GCN2-NEXT: .LBB42_4: ; %atomicrmw.private -; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_cbranch_execz .LBB42_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_and_b32_e32 v2, v0, v2 +; GCN2-NEXT: v_and_b32_e32 v2, v4, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_and_b32_e32 v3, v1, v3 -; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: v_and_b32_e32 v3, v5, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: .LBB42_6: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 +; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -5264,21 +5926,37 @@ define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB42_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB42_4 +; GCN3-NEXT: s_cbranch_execnz .LBB42_6 ; GCN3-NEXT: .LBB42_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB42_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB42_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_and_b32_e32 v7, v9, v3 +; GCN3-NEXT: v_and_b32_e32 v6, v8, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB42_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB42_2 -; GCN3-NEXT: .LBB42_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB42_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 @@ -5310,21 +5988,40 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB43_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB43_4 +; GCN1-NEXT: s_cbranch_execnz .LBB43_6 ; GCN1-NEXT: .LBB43_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB43_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB43_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_and_b32_e32 v7, v9, v3 +; GCN1-NEXT: v_and_b32_e32 v6, v8, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB43_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB43_2 -; GCN1-NEXT: .LBB43_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB43_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -5353,21 +6050,40 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB43_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB43_4 +; GCN2-NEXT: s_cbranch_execnz .LBB43_6 ; GCN2-NEXT: .LBB43_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB43_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB43_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_and_b32_e32 v7, v9, v3 +; GCN2-NEXT: v_and_b32_e32 v6, v8, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB43_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB43_2 -; GCN2-NEXT: .LBB43_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB43_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -5394,21 +6110,37 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB43_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB43_4 +; GCN3-NEXT: s_cbranch_execnz .LBB43_6 ; GCN3-NEXT: .LBB43_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB43_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB43_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_and_b32_e32 v7, v9, v3 +; GCN3-NEXT: v_and_b32_e32 v6, v8, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB43_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB43_2 -; GCN3-NEXT: .LBB43_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB43_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 @@ -5439,21 +6171,39 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN1-NEXT: s_mov_b64 s[34:35], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB44_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB44_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccnz .LBB44_6 ; GCN1-NEXT: .LBB44_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB44_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB44_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB44_2 -; GCN1-NEXT: .LBB44_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB44_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB44_2 +; GCN1-NEXT: .LBB44_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec ; GCN1-NEXT: s_cselect_b32 s34, s4, -1 @@ -5482,21 +6232,39 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN2-NEXT: s_mov_b64 s[34:35], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB44_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB44_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccnz .LBB44_6 ; GCN2-NEXT: .LBB44_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB44_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB44_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB44_2 -; GCN2-NEXT: .LBB44_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB44_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB44_2 +; GCN2-NEXT: .LBB44_6: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 @@ -5522,21 +6290,34 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_mov_b64 s[34:35], -1 ; GCN3-NEXT: s_cbranch_vccnz .LBB44_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB44_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccnz .LBB44_6 ; GCN3-NEXT: .LBB44_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB44_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB44_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB44_2 -; GCN3-NEXT: .LBB44_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB44_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB44_2 +; GCN3-NEXT: .LBB44_6: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v0, s34 @@ -5568,21 +6349,39 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN1-NEXT: s_mov_b64 s[36:37], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB45_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB45_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccnz .LBB45_6 ; GCN1-NEXT: .LBB45_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB45_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB45_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB45_2 -; GCN1-NEXT: .LBB45_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB45_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB45_2 +; GCN1-NEXT: .LBB45_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec ; GCN1-NEXT: s_cselect_b32 s34, s34, -1 @@ -5613,21 +6412,39 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN2-NEXT: s_mov_b64 s[36:37], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB45_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB45_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccnz .LBB45_6 ; GCN2-NEXT: .LBB45_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB45_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB45_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB45_2 -; GCN2-NEXT: .LBB45_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB45_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB45_2 +; GCN2-NEXT: .LBB45_6: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 @@ -5655,21 +6472,34 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_mov_b64 s[36:37], -1 ; GCN3-NEXT: s_cbranch_vccnz .LBB45_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB45_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccnz .LBB45_6 ; GCN3-NEXT: .LBB45_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB45_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB45_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_and_b32_e32 v0, s6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB45_2 -; GCN3-NEXT: .LBB45_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB45_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB45_2 +; GCN3-NEXT: .LBB45_6: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1 ; GCN3-NEXT: v_mov_b32_e32 v0, s34 @@ -5698,20 +6528,38 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: s_cmp_eq_u32 s5, s34 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB46_2 +; GCN1-NEXT: s_cbranch_vccz .LBB46_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB46_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: v_and_b32_e32 v5, s7, v7 +; GCN1-NEXT: v_and_b32_e32 v4, s6, v6 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB46_3 -; GCN1-NEXT: s_branch .LBB46_4 -; GCN1-NEXT: .LBB46_2: +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB46_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB46_6 +; GCN1-NEXT: .LBB46_4: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB46_3: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB46_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec ; GCN1-NEXT: s_cselect_b32 s34, s4, -1 @@ -5726,7 +6574,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_and_b32_e32 v5, s7, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB46_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB46_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -5739,20 +6587,38 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: s_cmp_eq_u32 s5, s34 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB46_2 +; GCN2-NEXT: s_cbranch_vccz .LBB46_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB46_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: v_and_b32_e32 v5, s7, v7 +; GCN2-NEXT: v_and_b32_e32 v4, s6, v6 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB46_3 -; GCN2-NEXT: s_branch .LBB46_4 -; GCN2-NEXT: .LBB46_2: +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB46_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB46_6 +; GCN2-NEXT: .LBB46_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB46_3: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB46_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -5766,7 +6632,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_and_b32_e32 v5, s7, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB46_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB46_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -5777,20 +6643,33 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_cmp_eq_u32 s5, s35 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB46_2 +; GCN3-NEXT: s_cbranch_vccz .LBB46_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB46_2: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v1 +; GCN3-NEXT: v_mov_b32_e32 v6, v0 +; GCN3-NEXT: v_and_b32_e32 v5, s7, v7 +; GCN3-NEXT: v_and_b32_e32 v4, s6, v6 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB46_3 -; GCN3-NEXT: s_branch .LBB46_4 -; GCN3-NEXT: .LBB46_2: +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB46_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB46_6 +; GCN3-NEXT: .LBB46_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB46_3: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB46_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -5802,7 +6681,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_and_b32_e32 v4, s6, v0 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB46_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB46_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr %ptr, i64 %in seq_cst @@ -5821,20 +6700,38 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB47_2 +; GCN1-NEXT: s_cbranch_vccz .LBB47_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB47_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: v_and_b32_e32 v5, s7, v7 +; GCN1-NEXT: v_and_b32_e32 v4, s6, v6 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB47_3 -; GCN1-NEXT: s_branch .LBB47_4 -; GCN1-NEXT: .LBB47_2: +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB47_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB47_6 +; GCN1-NEXT: .LBB47_4: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB47_3: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB47_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec ; GCN1-NEXT: s_cselect_b32 s34, s34, -1 @@ -5849,7 +6746,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: v_and_b32_e32 v5, s7, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB47_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB47_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -5864,20 +6761,38 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB47_2 +; GCN2-NEXT: s_cbranch_vccz .LBB47_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB47_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: v_and_b32_e32 v5, s7, v7 +; GCN2-NEXT: v_and_b32_e32 v4, s6, v6 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB47_3 -; GCN2-NEXT: s_branch .LBB47_4 -; GCN2-NEXT: .LBB47_2: +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB47_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB47_6 +; GCN2-NEXT: .LBB47_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB47_3: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB47_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -5891,7 +6806,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: v_and_b32_e32 v5, s7, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB47_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB47_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -5904,20 +6819,33 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB47_2 +; GCN3-NEXT: s_cbranch_vccz .LBB47_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: v_mov_b32_e32 v3, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB47_2: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v1 +; GCN3-NEXT: v_mov_b32_e32 v6, v0 +; GCN3-NEXT: v_and_b32_e32 v5, s7, v7 +; GCN3-NEXT: v_and_b32_e32 v4, s6, v6 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB47_3 -; GCN3-NEXT: s_branch .LBB47_4 -; GCN3-NEXT: .LBB47_2: +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB47_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB47_6 +; GCN3-NEXT: .LBB47_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB47_3: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB47_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -5929,7 +6857,7 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_and_b32_e32 v4, s6, v0 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB47_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB47_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 @@ -5950,21 +6878,40 @@ define void @flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB48_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB48_4 +; GCN1-NEXT: s_cbranch_execnz .LBB48_6 ; GCN1-NEXT: .LBB48_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB48_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB48_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB48_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB48_2 -; GCN1-NEXT: .LBB48_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB48_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 @@ -5992,21 +6939,40 @@ define void @flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB48_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB48_4 +; GCN2-NEXT: s_cbranch_execnz .LBB48_6 ; GCN2-NEXT: .LBB48_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB48_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB48_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB48_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB48_2 -; GCN2-NEXT: .LBB48_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB48_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 @@ -6032,21 +6998,37 @@ define void @flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB48_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB48_4 +; GCN3-NEXT: s_cbranch_execnz .LBB48_6 ; GCN3-NEXT: .LBB48_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB48_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB48_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_and_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_and_b32_e32 v4, v6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB48_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB48_2 -; GCN3-NEXT: .LBB48_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB48_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 @@ -6079,21 +7061,40 @@ define i64 @flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB49_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB49_4 +; GCN1-NEXT: s_cbranch_execnz .LBB49_6 ; GCN1-NEXT: .LBB49_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB49_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB49_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_and_b32_e32 v7, v9, v3 +; GCN1-NEXT: v_and_b32_e32 v6, v8, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB49_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB49_2 -; GCN1-NEXT: .LBB49_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB49_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -6122,21 +7123,40 @@ define i64 @flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB49_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB49_4 +; GCN2-NEXT: s_cbranch_execnz .LBB49_6 ; GCN2-NEXT: .LBB49_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB49_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB49_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_and_b32_e32 v7, v9, v3 +; GCN2-NEXT: v_and_b32_e32 v6, v8, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB49_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB49_2 -; GCN2-NEXT: .LBB49_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB49_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -6163,21 +7183,37 @@ define i64 @flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB49_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB49_4 +; GCN3-NEXT: s_cbranch_execnz .LBB49_6 ; GCN3-NEXT: .LBB49_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB49_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB49_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_and_b32_e32 v7, v9, v3 +; GCN3-NEXT: v_and_b32_e32 v6, v8, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB49_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB49_2 -; GCN3-NEXT: .LBB49_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB49_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 @@ -8126,21 +9162,40 @@ define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB60_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB60_4 +; GCN1-NEXT: s_cbranch_execnz .LBB60_6 ; GCN1-NEXT: .LBB60_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB60_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB60_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB60_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB60_2 -; GCN1-NEXT: .LBB60_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB60_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 @@ -8166,21 +9221,40 @@ define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB60_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB60_4 +; GCN2-NEXT: s_cbranch_execnz .LBB60_6 ; GCN2-NEXT: .LBB60_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB60_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB60_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB60_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB60_2 -; GCN2-NEXT: .LBB60_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB60_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 @@ -8204,21 +9278,37 @@ define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB60_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB60_4 +; GCN3-NEXT: s_cbranch_execnz .LBB60_6 ; GCN3-NEXT: .LBB60_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB60_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB60_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB60_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB60_2 -; GCN3-NEXT: .LBB60_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB60_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 @@ -8249,21 +9339,40 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB61_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB61_4 +; GCN1-NEXT: s_cbranch_execnz .LBB61_6 ; GCN1-NEXT: .LBB61_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB61_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB61_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB61_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB61_2 -; GCN1-NEXT: .LBB61_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB61_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 @@ -8291,21 +9400,40 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB61_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB61_4 +; GCN2-NEXT: s_cbranch_execnz .LBB61_6 ; GCN2-NEXT: .LBB61_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB61_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB61_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB61_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB61_2 -; GCN2-NEXT: .LBB61_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB61_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 @@ -8331,21 +9459,37 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB61_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB61_4 +; GCN3-NEXT: s_cbranch_execnz .LBB61_6 ; GCN3-NEXT: .LBB61_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB61_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB61_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB61_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB61_2 -; GCN3-NEXT: .LBB61_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB61_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 @@ -8370,41 +9514,56 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN1-NEXT: v_mov_b32_e32 v5, v1 -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB62_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB62_4 -; GCN1-NEXT: .LBB62_2: ; %atomicrmw.phi -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB62_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: s_cbranch_execz .LBB62_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v5, v[4:5] +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB62_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB62_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: .LBB62_4: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB62_2 -; GCN1-NEXT: .LBB62_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 -; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_cbranch_execz .LBB62_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_or_b32_e32 v2, v0, v2 +; GCN1-NEXT: v_or_b32_e32 v2, v4, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_or_b32_e32 v3, v1, v3 -; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: v_or_b32_e32 v3, v5, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: .LBB62_6: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 +; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -8413,41 +9572,56 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN2-NEXT: v_mov_b32_e32 v5, v1 -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB62_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB62_4 -; GCN2-NEXT: .LBB62_2: ; %atomicrmw.phi -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB62_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: s_cbranch_execz .LBB62_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v5, v[4:5] +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB62_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB62_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: .LBB62_4: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB62_2 -; GCN2-NEXT: .LBB62_4: ; %atomicrmw.private -; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_cbranch_execz .LBB62_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_or_b32_e32 v2, v0, v2 +; GCN2-NEXT: v_or_b32_e32 v2, v4, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_or_b32_e32 v3, v1, v3 -; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: v_or_b32_e32 v3, v5, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: .LBB62_6: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 +; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -8462,21 +9636,37 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB62_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB62_4 +; GCN3-NEXT: s_cbranch_execnz .LBB62_6 ; GCN3-NEXT: .LBB62_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB62_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB62_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_or_b32_e32 v7, v9, v3 +; GCN3-NEXT: v_or_b32_e32 v6, v8, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB62_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB62_2 -; GCN3-NEXT: .LBB62_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB62_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 @@ -8508,21 +9698,40 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB63_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB63_4 +; GCN1-NEXT: s_cbranch_execnz .LBB63_6 ; GCN1-NEXT: .LBB63_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB63_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB63_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_or_b32_e32 v7, v9, v3 +; GCN1-NEXT: v_or_b32_e32 v6, v8, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB63_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB63_2 -; GCN1-NEXT: .LBB63_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB63_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -8551,21 +9760,40 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB63_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB63_4 +; GCN2-NEXT: s_cbranch_execnz .LBB63_6 ; GCN2-NEXT: .LBB63_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB63_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB63_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_or_b32_e32 v7, v9, v3 +; GCN2-NEXT: v_or_b32_e32 v6, v8, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB63_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB63_2 -; GCN2-NEXT: .LBB63_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB63_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -8592,21 +9820,37 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB63_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB63_4 +; GCN3-NEXT: s_cbranch_execnz .LBB63_6 ; GCN3-NEXT: .LBB63_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB63_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB63_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_or_b32_e32 v7, v9, v3 +; GCN3-NEXT: v_or_b32_e32 v6, v8, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB63_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB63_2 -; GCN3-NEXT: .LBB63_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB63_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 @@ -8637,21 +9881,39 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN1-NEXT: s_mov_b64 s[34:35], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB64_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB64_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccnz .LBB64_6 ; GCN1-NEXT: .LBB64_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB64_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB64_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB64_2 -; GCN1-NEXT: .LBB64_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB64_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB64_2 +; GCN1-NEXT: .LBB64_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec ; GCN1-NEXT: s_cselect_b32 s34, s4, -1 @@ -8680,21 +9942,39 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN2-NEXT: s_mov_b64 s[34:35], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB64_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB64_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccnz .LBB64_6 ; GCN2-NEXT: .LBB64_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB64_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB64_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB64_2 -; GCN2-NEXT: .LBB64_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB64_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB64_2 +; GCN2-NEXT: .LBB64_6: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 @@ -8720,21 +10000,34 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_mov_b64 s[34:35], -1 ; GCN3-NEXT: s_cbranch_vccnz .LBB64_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB64_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccnz .LBB64_6 ; GCN3-NEXT: .LBB64_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB64_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB64_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB64_2 -; GCN3-NEXT: .LBB64_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB64_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB64_2 +; GCN3-NEXT: .LBB64_6: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v0, s34 @@ -8766,21 +10059,39 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN1-NEXT: s_mov_b64 s[36:37], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB65_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB65_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccnz .LBB65_6 ; GCN1-NEXT: .LBB65_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB65_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB65_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB65_2 -; GCN1-NEXT: .LBB65_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB65_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB65_2 +; GCN1-NEXT: .LBB65_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec ; GCN1-NEXT: s_cselect_b32 s34, s34, -1 @@ -8811,21 +10122,39 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN2-NEXT: s_mov_b64 s[36:37], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB65_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB65_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccnz .LBB65_6 ; GCN2-NEXT: .LBB65_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB65_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB65_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB65_2 -; GCN2-NEXT: .LBB65_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB65_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB65_2 +; GCN2-NEXT: .LBB65_6: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 @@ -8853,21 +10182,34 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_mov_b64 s[36:37], -1 ; GCN3-NEXT: s_cbranch_vccnz .LBB65_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB65_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccnz .LBB65_6 ; GCN3-NEXT: .LBB65_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB65_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB65_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_or_b32_e32 v0, s6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB65_2 -; GCN3-NEXT: .LBB65_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB65_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB65_2 +; GCN3-NEXT: .LBB65_6: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1 ; GCN3-NEXT: v_mov_b32_e32 v0, s34 @@ -8896,20 +10238,38 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN1-NEXT: s_cmp_eq_u32 s5, s34 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB66_2 +; GCN1-NEXT: s_cbranch_vccz .LBB66_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB66_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: v_or_b32_e32 v5, s7, v7 +; GCN1-NEXT: v_or_b32_e32 v4, s6, v6 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB66_3 -; GCN1-NEXT: s_branch .LBB66_4 -; GCN1-NEXT: .LBB66_2: +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB66_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB66_6 +; GCN1-NEXT: .LBB66_4: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB66_3: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB66_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec ; GCN1-NEXT: s_cselect_b32 s34, s4, -1 @@ -8924,7 +10284,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN1-NEXT: v_or_b32_e32 v5, s7, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB66_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB66_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -8937,20 +10297,38 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN2-NEXT: s_cmp_eq_u32 s5, s34 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB66_2 +; GCN2-NEXT: s_cbranch_vccz .LBB66_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB66_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: v_or_b32_e32 v5, s7, v7 +; GCN2-NEXT: v_or_b32_e32 v4, s6, v6 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB66_3 -; GCN2-NEXT: s_branch .LBB66_4 -; GCN2-NEXT: .LBB66_2: +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB66_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB66_6 +; GCN2-NEXT: .LBB66_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB66_3: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB66_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -8964,7 +10342,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN2-NEXT: v_or_b32_e32 v5, s7, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB66_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB66_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -8975,20 +10353,33 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN3-NEXT: s_cmp_eq_u32 s5, s35 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB66_2 +; GCN3-NEXT: s_cbranch_vccz .LBB66_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB66_2: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v1 +; GCN3-NEXT: v_mov_b32_e32 v6, v0 +; GCN3-NEXT: v_or_b32_e32 v5, s7, v7 +; GCN3-NEXT: v_or_b32_e32 v4, s6, v6 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB66_3 -; GCN3-NEXT: s_branch .LBB66_4 -; GCN3-NEXT: .LBB66_2: +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB66_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB66_6 +; GCN3-NEXT: .LBB66_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB66_3: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB66_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -9000,7 +10391,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GCN3-NEXT: v_or_b32_e32 v4, s6, v0 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB66_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB66_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr %ptr, i64 %in seq_cst @@ -9019,20 +10410,38 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB67_2 +; GCN1-NEXT: s_cbranch_vccz .LBB67_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB67_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: v_or_b32_e32 v5, s7, v7 +; GCN1-NEXT: v_or_b32_e32 v4, s6, v6 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB67_3 -; GCN1-NEXT: s_branch .LBB67_4 -; GCN1-NEXT: .LBB67_2: +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB67_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB67_6 +; GCN1-NEXT: .LBB67_4: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB67_3: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB67_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec ; GCN1-NEXT: s_cselect_b32 s34, s34, -1 @@ -9047,7 +10456,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: v_or_b32_e32 v5, s7, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB67_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB67_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -9062,20 +10471,38 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB67_2 +; GCN2-NEXT: s_cbranch_vccz .LBB67_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB67_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: v_or_b32_e32 v5, s7, v7 +; GCN2-NEXT: v_or_b32_e32 v4, s6, v6 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB67_3 -; GCN2-NEXT: s_branch .LBB67_4 -; GCN2-NEXT: .LBB67_2: +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB67_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB67_6 +; GCN2-NEXT: .LBB67_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB67_3: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB67_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -9089,7 +10516,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: v_or_b32_e32 v5, s7, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB67_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB67_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -9102,20 +10529,33 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB67_2 +; GCN3-NEXT: s_cbranch_vccz .LBB67_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: v_mov_b32_e32 v3, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB67_2: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v1 +; GCN3-NEXT: v_mov_b32_e32 v6, v0 +; GCN3-NEXT: v_or_b32_e32 v5, s7, v7 +; GCN3-NEXT: v_or_b32_e32 v4, s6, v6 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB67_3 -; GCN3-NEXT: s_branch .LBB67_4 -; GCN3-NEXT: .LBB67_2: +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB67_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB67_6 +; GCN3-NEXT: .LBB67_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB67_3: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB67_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -9127,7 +10567,7 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_or_b32_e32 v4, s6, v0 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB67_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB67_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 @@ -9148,21 +10588,40 @@ define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB68_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB68_4 +; GCN1-NEXT: s_cbranch_execnz .LBB68_6 ; GCN1-NEXT: .LBB68_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB68_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB68_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB68_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB68_2 -; GCN1-NEXT: .LBB68_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB68_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 @@ -9190,21 +10649,40 @@ define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB68_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB68_4 +; GCN2-NEXT: s_cbranch_execnz .LBB68_6 ; GCN2-NEXT: .LBB68_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB68_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB68_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB68_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB68_2 -; GCN2-NEXT: .LBB68_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB68_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 @@ -9230,21 +10708,37 @@ define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB68_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB68_4 +; GCN3-NEXT: s_cbranch_execnz .LBB68_6 ; GCN3-NEXT: .LBB68_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB68_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB68_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_or_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_or_b32_e32 v4, v6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB68_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB68_2 -; GCN3-NEXT: .LBB68_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB68_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 @@ -9277,21 +10771,40 @@ define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB69_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB69_4 +; GCN1-NEXT: s_cbranch_execnz .LBB69_6 ; GCN1-NEXT: .LBB69_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB69_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB69_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_or_b32_e32 v7, v9, v3 +; GCN1-NEXT: v_or_b32_e32 v6, v8, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB69_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB69_2 -; GCN1-NEXT: .LBB69_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB69_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -9320,21 +10833,40 @@ define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB69_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB69_4 +; GCN2-NEXT: s_cbranch_execnz .LBB69_6 ; GCN2-NEXT: .LBB69_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB69_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB69_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_or_b32_e32 v7, v9, v3 +; GCN2-NEXT: v_or_b32_e32 v6, v8, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB69_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB69_2 -; GCN2-NEXT: .LBB69_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB69_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -9361,21 +10893,37 @@ define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB69_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB69_4 +; GCN3-NEXT: s_cbranch_execnz .LBB69_6 ; GCN3-NEXT: .LBB69_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB69_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB69_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_or_b32_e32 v7, v9, v3 +; GCN3-NEXT: v_or_b32_e32 v6, v8, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB69_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB69_2 -; GCN3-NEXT: .LBB69_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB69_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 @@ -9409,21 +10957,40 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB70_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB70_4 +; GCN1-NEXT: s_cbranch_execnz .LBB70_6 ; GCN1-NEXT: .LBB70_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB70_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB70_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB70_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB70_2 -; GCN1-NEXT: .LBB70_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB70_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 @@ -9449,21 +11016,40 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB70_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB70_4 +; GCN2-NEXT: s_cbranch_execnz .LBB70_6 ; GCN2-NEXT: .LBB70_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB70_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB70_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB70_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB70_2 -; GCN2-NEXT: .LBB70_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB70_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 @@ -9487,21 +11073,37 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB70_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB70_4 +; GCN3-NEXT: s_cbranch_execnz .LBB70_6 ; GCN3-NEXT: .LBB70_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB70_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB70_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB70_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB70_2 -; GCN3-NEXT: .LBB70_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB70_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 @@ -9532,21 +11134,40 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB71_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB71_4 +; GCN1-NEXT: s_cbranch_execnz .LBB71_6 ; GCN1-NEXT: .LBB71_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB71_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB71_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB71_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB71_2 -; GCN1-NEXT: .LBB71_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB71_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 @@ -9574,21 +11195,40 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB71_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB71_4 +; GCN2-NEXT: s_cbranch_execnz .LBB71_6 ; GCN2-NEXT: .LBB71_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB71_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB71_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB71_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB71_2 -; GCN2-NEXT: .LBB71_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB71_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 @@ -9614,21 +11254,37 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB71_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB71_4 +; GCN3-NEXT: s_cbranch_execnz .LBB71_6 ; GCN3-NEXT: .LBB71_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB71_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB71_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB71_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB71_2 -; GCN3-NEXT: .LBB71_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB71_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 @@ -9653,41 +11309,56 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN1-NEXT: v_mov_b32_e32 v5, v1 -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB72_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB72_4 -; GCN1-NEXT: .LBB72_2: ; %atomicrmw.phi -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB72_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: s_cbranch_execz .LBB72_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v5, v[4:5] +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB72_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB72_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 +; GCN1-NEXT: .LBB72_4: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB72_2 -; GCN1-NEXT: .LBB72_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 -; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_cbranch_execz .LBB72_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_xor_b32_e32 v2, v0, v2 +; GCN1-NEXT: v_xor_b32_e32 v2, v4, v2 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_xor_b32_e32 v3, v1, v3 -; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: v_xor_b32_e32 v3, v5, v3 +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: .LBB72_6: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 +; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -9696,41 +11367,56 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN2-NEXT: v_mov_b32_e32 v5, v1 -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB72_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB72_4 -; GCN2-NEXT: .LBB72_2: ; %atomicrmw.phi -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB72_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: s_cbranch_execz .LBB72_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v5, v[4:5] +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB72_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB72_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 +; GCN2-NEXT: .LBB72_4: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB72_2 -; GCN2-NEXT: .LBB72_4: ; %atomicrmw.private -; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_cbranch_execz .LBB72_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_xor_b32_e32 v2, v0, v2 +; GCN2-NEXT: v_xor_b32_e32 v2, v4, v2 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_xor_b32_e32 v3, v1, v3 -; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: v_xor_b32_e32 v3, v5, v3 +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: .LBB72_6: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 +; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -9745,21 +11431,37 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB72_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB72_4 +; GCN3-NEXT: s_cbranch_execnz .LBB72_6 ; GCN3-NEXT: .LBB72_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB72_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB72_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_xor_b32_e32 v7, v9, v3 +; GCN3-NEXT: v_xor_b32_e32 v6, v8, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB72_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB72_2 -; GCN3-NEXT: .LBB72_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB72_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 @@ -9791,21 +11493,40 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB73_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB73_4 +; GCN1-NEXT: s_cbranch_execnz .LBB73_6 ; GCN1-NEXT: .LBB73_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB73_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB73_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_xor_b32_e32 v7, v9, v3 +; GCN1-NEXT: v_xor_b32_e32 v6, v8, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB73_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB73_2 -; GCN1-NEXT: .LBB73_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB73_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -9834,21 +11555,40 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB73_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB73_4 +; GCN2-NEXT: s_cbranch_execnz .LBB73_6 ; GCN2-NEXT: .LBB73_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB73_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB73_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_xor_b32_e32 v7, v9, v3 +; GCN2-NEXT: v_xor_b32_e32 v6, v8, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB73_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB73_2 -; GCN2-NEXT: .LBB73_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB73_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -9875,21 +11615,37 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB73_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB73_4 +; GCN3-NEXT: s_cbranch_execnz .LBB73_6 ; GCN3-NEXT: .LBB73_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB73_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB73_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_xor_b32_e32 v7, v9, v3 +; GCN3-NEXT: v_xor_b32_e32 v6, v8, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB73_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB73_2 -; GCN3-NEXT: .LBB73_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB73_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 @@ -9920,21 +11676,39 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN1-NEXT: s_mov_b64 s[34:35], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB74_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB74_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccnz .LBB74_6 ; GCN1-NEXT: .LBB74_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB74_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB74_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB74_2 -; GCN1-NEXT: .LBB74_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB74_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB74_2 +; GCN1-NEXT: .LBB74_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec ; GCN1-NEXT: s_cselect_b32 s34, s4, -1 @@ -9963,21 +11737,39 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN2-NEXT: s_mov_b64 s[34:35], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB74_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB74_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccnz .LBB74_6 ; GCN2-NEXT: .LBB74_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB74_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB74_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB74_2 -; GCN2-NEXT: .LBB74_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB74_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB74_2 +; GCN2-NEXT: .LBB74_6: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 @@ -10003,21 +11795,34 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_mov_b64 s[34:35], -1 ; GCN3-NEXT: s_cbranch_vccnz .LBB74_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB74_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccnz .LBB74_6 ; GCN3-NEXT: .LBB74_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB74_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB74_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB74_2 -; GCN3-NEXT: .LBB74_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB74_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB74_2 +; GCN3-NEXT: .LBB74_6: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v0, s34 @@ -10049,21 +11854,39 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN1-NEXT: s_mov_b64 s[36:37], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB75_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB75_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccnz .LBB75_6 ; GCN1-NEXT: .LBB75_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB75_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB75_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN1-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB75_2 -; GCN1-NEXT: .LBB75_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB75_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB75_2 +; GCN1-NEXT: .LBB75_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec ; GCN1-NEXT: s_cselect_b32 s34, s34, -1 @@ -10094,21 +11917,39 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN2-NEXT: s_mov_b64 s[36:37], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB75_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB75_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccnz .LBB75_6 ; GCN2-NEXT: .LBB75_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB75_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB75_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN2-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB75_2 -; GCN2-NEXT: .LBB75_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB75_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB75_2 +; GCN2-NEXT: .LBB75_6: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 @@ -10136,21 +11977,34 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_mov_b64 s[36:37], -1 ; GCN3-NEXT: s_cbranch_vccnz .LBB75_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB75_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccnz .LBB75_6 ; GCN3-NEXT: .LBB75_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB75_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB75_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v1, s7, v3 +; GCN3-NEXT: v_xor_b32_e32 v0, s6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB75_2 -; GCN3-NEXT: .LBB75_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB75_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB75_2 +; GCN3-NEXT: .LBB75_6: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1 ; GCN3-NEXT: v_mov_b32_e32 v0, s34 @@ -10179,20 +12033,38 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: s_cmp_eq_u32 s5, s34 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB76_2 +; GCN1-NEXT: s_cbranch_vccz .LBB76_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB76_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: v_xor_b32_e32 v5, s7, v7 +; GCN1-NEXT: v_xor_b32_e32 v4, s6, v6 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB76_3 -; GCN1-NEXT: s_branch .LBB76_4 -; GCN1-NEXT: .LBB76_2: +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB76_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB76_6 +; GCN1-NEXT: .LBB76_4: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB76_3: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB76_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec ; GCN1-NEXT: s_cselect_b32 s34, s4, -1 @@ -10207,7 +12079,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN1-NEXT: v_xor_b32_e32 v5, s7, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB76_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB76_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -10220,20 +12092,38 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: s_cmp_eq_u32 s5, s34 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB76_2 +; GCN2-NEXT: s_cbranch_vccz .LBB76_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB76_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: v_xor_b32_e32 v5, s7, v7 +; GCN2-NEXT: v_xor_b32_e32 v4, s6, v6 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB76_3 -; GCN2-NEXT: s_branch .LBB76_4 -; GCN2-NEXT: .LBB76_2: +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB76_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB76_6 +; GCN2-NEXT: .LBB76_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB76_3: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB76_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -10247,7 +12137,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN2-NEXT: v_xor_b32_e32 v5, s7, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB76_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB76_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -10258,20 +12148,33 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: s_cmp_eq_u32 s5, s35 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB76_2 +; GCN3-NEXT: s_cbranch_vccz .LBB76_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB76_2: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v1 +; GCN3-NEXT: v_mov_b32_e32 v6, v0 +; GCN3-NEXT: v_xor_b32_e32 v5, s7, v7 +; GCN3-NEXT: v_xor_b32_e32 v4, s6, v6 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB76_3 -; GCN3-NEXT: s_branch .LBB76_4 -; GCN3-NEXT: .LBB76_2: +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB76_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB76_6 +; GCN3-NEXT: .LBB76_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB76_3: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB76_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -10283,7 +12186,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GCN3-NEXT: v_xor_b32_e32 v4, s6, v0 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB76_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB76_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr %ptr, i64 %in seq_cst @@ -10302,20 +12205,38 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: s_cmp_eq_u32 s35, s36 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB77_2 +; GCN1-NEXT: s_cbranch_vccz .LBB77_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB77_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: v_xor_b32_e32 v5, s7, v7 +; GCN1-NEXT: v_xor_b32_e32 v4, s6, v6 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB77_3 -; GCN1-NEXT: s_branch .LBB77_4 -; GCN1-NEXT: .LBB77_2: +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB77_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB77_6 +; GCN1-NEXT: .LBB77_4: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB77_3: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB77_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec ; GCN1-NEXT: s_cselect_b32 s34, s34, -1 @@ -10330,7 +12251,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN1-NEXT: v_xor_b32_e32 v5, s7, v1 ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB77_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB77_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -10345,20 +12266,38 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: s_cmp_eq_u32 s35, s36 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB77_2 +; GCN2-NEXT: s_cbranch_vccz .LBB77_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB77_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: v_xor_b32_e32 v5, s7, v7 +; GCN2-NEXT: v_xor_b32_e32 v4, s6, v6 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB77_3 -; GCN2-NEXT: s_branch .LBB77_4 -; GCN2-NEXT: .LBB77_2: +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB77_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB77_6 +; GCN2-NEXT: .LBB77_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB77_3: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB77_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -10372,7 +12311,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN2-NEXT: v_xor_b32_e32 v5, s7, v1 ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB77_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB77_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -10385,20 +12324,33 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB77_2 +; GCN3-NEXT: s_cbranch_vccz .LBB77_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: v_mov_b32_e32 v3, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB77_2: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v1 +; GCN3-NEXT: v_mov_b32_e32 v6, v0 +; GCN3-NEXT: v_xor_b32_e32 v5, s7, v7 +; GCN3-NEXT: v_xor_b32_e32 v4, s6, v6 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB77_3 -; GCN3-NEXT: s_branch .LBB77_4 -; GCN3-NEXT: .LBB77_2: +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB77_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB77_6 +; GCN3-NEXT: .LBB77_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB77_3: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB77_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -10410,7 +12362,7 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GCN3-NEXT: v_xor_b32_e32 v4, s6, v0 ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB77_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB77_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 @@ -10431,21 +12383,40 @@ define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB78_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB78_4 +; GCN1-NEXT: s_cbranch_execnz .LBB78_6 ; GCN1-NEXT: .LBB78_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB78_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB78_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN1-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB78_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB78_2 -; GCN1-NEXT: .LBB78_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB78_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 @@ -10473,21 +12444,40 @@ define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB78_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB78_4 +; GCN2-NEXT: s_cbranch_execnz .LBB78_6 ; GCN2-NEXT: .LBB78_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB78_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB78_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN2-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB78_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB78_2 -; GCN2-NEXT: .LBB78_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB78_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 @@ -10513,21 +12503,37 @@ define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB78_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB78_4 +; GCN3-NEXT: s_cbranch_execnz .LBB78_6 ; GCN3-NEXT: .LBB78_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB78_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB78_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_xor_b32_e32 v5, v7, v3 +; GCN3-NEXT: v_xor_b32_e32 v4, v6, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB78_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB78_2 -; GCN3-NEXT: .LBB78_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB78_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4 @@ -10560,21 +12566,40 @@ define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB79_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB79_4 +; GCN1-NEXT: s_cbranch_execnz .LBB79_6 ; GCN1-NEXT: .LBB79_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB79_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB79_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_xor_b32_e32 v7, v9, v3 +; GCN1-NEXT: v_xor_b32_e32 v6, v8, v2 +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB79_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr3 +; GCN1-NEXT: ; implicit-def: $vgpr2 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB79_2 -; GCN1-NEXT: .LBB79_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB79_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -10603,21 +12628,40 @@ define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB79_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB79_4 +; GCN2-NEXT: s_cbranch_execnz .LBB79_6 ; GCN2-NEXT: .LBB79_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB79_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB79_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_xor_b32_e32 v7, v9, v3 +; GCN2-NEXT: v_xor_b32_e32 v6, v8, v2 +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB79_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr3 +; GCN2-NEXT: ; implicit-def: $vgpr2 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB79_2 -; GCN2-NEXT: .LBB79_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB79_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -10644,21 +12688,37 @@ define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB79_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB79_4 +; GCN3-NEXT: s_cbranch_execnz .LBB79_6 ; GCN3-NEXT: .LBB79_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB79_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB79_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_xor_b32_e32 v7, v9, v3 +; GCN3-NEXT: v_xor_b32_e32 v6, v8, v2 +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB79_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr3 +; GCN3-NEXT: ; implicit-def: $vgpr2 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB79_2 -; GCN3-NEXT: .LBB79_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB79_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 @@ -20446,21 +22506,42 @@ define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB131_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB131_4 +; GCN1-NEXT: s_cbranch_execnz .LBB131_6 ; GCN1-NEXT: .LBB131_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB131_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB131_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB131_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB131_2 -; GCN1-NEXT: .LBB131_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB131_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -20489,21 +22570,42 @@ define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB131_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB131_4 +; GCN2-NEXT: s_cbranch_execnz .LBB131_6 ; GCN2-NEXT: .LBB131_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB131_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB131_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v6 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB131_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB131_2 -; GCN2-NEXT: .LBB131_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB131_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -20530,21 +22632,39 @@ define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB131_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB131_4 +; GCN3-NEXT: s_cbranch_execnz .LBB131_6 ; GCN3-NEXT: .LBB131_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB131_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB131_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB131_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB131_2 -; GCN3-NEXT: .LBB131_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB131_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -20578,21 +22698,42 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB132_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB132_4 +; GCN1-NEXT: s_cbranch_execnz .LBB132_6 ; GCN1-NEXT: .LBB132_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB132_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB132_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB132_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB132_2 -; GCN1-NEXT: .LBB132_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB132_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -20623,21 +22764,42 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB132_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB132_4 +; GCN2-NEXT: s_cbranch_execnz .LBB132_6 ; GCN2-NEXT: .LBB132_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB132_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB132_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v6 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB132_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB132_2 -; GCN2-NEXT: .LBB132_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB132_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -20666,21 +22828,39 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB132_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB132_4 +; GCN3-NEXT: s_cbranch_execnz .LBB132_6 ; GCN3-NEXT: .LBB132_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB132_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB132_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB132_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB132_2 -; GCN3-NEXT: .LBB132_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB132_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -20708,44 +22888,61 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN1-NEXT: v_mov_b32_e32 v5, v1 -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB133_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB133_4 -; GCN1-NEXT: .LBB133_2: ; %atomicrmw.phi -; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB133_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: s_cbranch_execz .LBB133_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v5, v[4:5] +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB133_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB133_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN1-NEXT: .LBB133_4: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB133_2 -; GCN1-NEXT: .LBB133_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 -; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_cbranch_execz .LBB133_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_add_i32_e32 v6, vcc, 1, v0 +; GCN1-NEXT: v_add_i32_e32 v6, vcc, 1, v4 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[2:3] ; GCN1-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v2, 0, v7, vcc -; GCN1-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN1-NEXT: .LBB133_6: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 +; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -20754,44 +22951,61 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN2-NEXT: v_mov_b32_e32 v5, v1 -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB133_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB133_4 -; GCN2-NEXT: .LBB133_2: ; %atomicrmw.phi -; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB133_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: s_cbranch_execz .LBB133_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v5, v[4:5] +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB133_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v6 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB133_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GCN2-NEXT: .LBB133_4: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB133_2 -; GCN2-NEXT: .LBB133_4: ; %atomicrmw.private -; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_cbranch_execz .LBB133_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_add_u32_e32 v6, vcc, 1, v0 +; GCN2-NEXT: v_add_u32_e32 v6, vcc, 1, v4 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[2:3] ; GCN2-NEXT: v_cndmask_b32_e32 v3, 0, v6, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v2, 0, v7, vcc -; GCN2-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN2-NEXT: .LBB133_6: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 +; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -20806,21 +23020,39 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB133_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB133_4 +; GCN3-NEXT: s_cbranch_execnz .LBB133_6 ; GCN3-NEXT: .LBB133_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB133_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB133_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v8 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v9, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB133_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB133_2 -; GCN3-NEXT: .LBB133_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB133_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -20855,21 +23087,42 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB134_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB134_4 +; GCN1-NEXT: s_cbranch_execnz .LBB134_6 ; GCN1-NEXT: .LBB134_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB134_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB134_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v8 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB134_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB134_2 -; GCN1-NEXT: .LBB134_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB134_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -20901,21 +23154,42 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB134_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB134_4 +; GCN2-NEXT: s_cbranch_execnz .LBB134_6 ; GCN2-NEXT: .LBB134_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB134_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB134_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v8 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB134_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB134_2 -; GCN2-NEXT: .LBB134_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB134_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -20945,21 +23219,39 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB134_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB134_4 +; GCN3-NEXT: s_cbranch_execnz .LBB134_6 ; GCN3-NEXT: .LBB134_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB134_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB134_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v8 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v9, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB134_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB134_2 -; GCN3-NEXT: .LBB134_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB134_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -20993,21 +23285,42 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN1-NEXT: s_mov_b64 s[34:35], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB135_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB135_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccnz .LBB135_6 ; GCN1-NEXT: .LBB135_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB135_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB135_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB135_2 -; GCN1-NEXT: .LBB135_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB135_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB135_2 +; GCN1-NEXT: .LBB135_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec ; GCN1-NEXT: s_cselect_b32 s34, s4, -1 @@ -21039,21 +23352,42 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN2-NEXT: s_mov_b64 s[34:35], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB135_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB135_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccnz .LBB135_6 ; GCN2-NEXT: .LBB135_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB135_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB135_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB135_2 -; GCN2-NEXT: .LBB135_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB135_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB135_2 +; GCN2-NEXT: .LBB135_6: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -21082,21 +23416,37 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_mov_b64 s[34:35], -1 ; GCN3-NEXT: s_cbranch_vccnz .LBB135_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB135_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccnz .LBB135_6 ; GCN3-NEXT: .LBB135_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB135_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB135_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB135_2 -; GCN3-NEXT: .LBB135_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB135_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB135_2 +; GCN3-NEXT: .LBB135_6: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -21131,21 +23481,42 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN1-NEXT: s_mov_b64 s[36:37], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB136_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB136_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN1-NEXT: s_cbranch_vccnz .LBB136_6 ; GCN1-NEXT: .LBB136_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB136_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v4, s34 +; GCN1-NEXT: v_mov_b32_e32 v5, s35 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB136_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB136_2 -; GCN1-NEXT: .LBB136_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB136_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB136_2 +; GCN1-NEXT: .LBB136_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec ; GCN1-NEXT: s_cselect_b32 s34, s34, -1 @@ -21179,21 +23550,42 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN2-NEXT: s_mov_b64 s[36:37], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB136_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB136_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN2-NEXT: s_cbranch_vccnz .LBB136_6 ; GCN2-NEXT: .LBB136_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB136_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v4, s34 +; GCN2-NEXT: v_mov_b32_e32 v5, s35 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB136_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB136_2 -; GCN2-NEXT: .LBB136_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB136_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB136_2 +; GCN2-NEXT: .LBB136_6: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -21224,21 +23616,37 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] ; GCN3-NEXT: s_mov_b64 s[36:37], -1 ; GCN3-NEXT: s_cbranch_vccnz .LBB136_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB136_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[36:37] +; GCN3-NEXT: s_cbranch_vccnz .LBB136_6 ; GCN3-NEXT: .LBB136_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB136_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s34 +; GCN3-NEXT: v_mov_b32_e32 v5, s35 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB136_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB136_2 -; GCN3-NEXT: .LBB136_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB136_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB136_2 +; GCN3-NEXT: .LBB136_6: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -21270,20 +23678,41 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB137_2 +; GCN1-NEXT: s_cbranch_vccz .LBB137_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[34:35], 0 +; GCN1-NEXT: .LBB137_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v6 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] +; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB137_3 -; GCN1-NEXT: s_branch .LBB137_4 -; GCN1-NEXT: .LBB137_2: +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_cbranch_execnz .LBB137_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN1-NEXT: s_branch .LBB137_6 +; GCN1-NEXT: .LBB137_4: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB137_3: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB137_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec ; GCN1-NEXT: s_cselect_b32 s34, s4, -1 @@ -21301,7 +23730,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB137_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB137_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -21314,20 +23743,41 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB137_2 +; GCN2-NEXT: s_cbranch_vccz .LBB137_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[34:35], 0 +; GCN2-NEXT: .LBB137_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v6 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] +; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB137_3 -; GCN2-NEXT: s_branch .LBB137_4 -; GCN2-NEXT: .LBB137_2: +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_cbranch_execnz .LBB137_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN2-NEXT: s_branch .LBB137_6 +; GCN2-NEXT: .LBB137_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB137_3: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB137_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -21344,7 +23794,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB137_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB137_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -21355,20 +23805,36 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB137_2 +; GCN3-NEXT: s_cbranch_vccz .LBB137_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[34:35], 0 +; GCN3-NEXT: .LBB137_2: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v1 +; GCN3-NEXT: v_mov_b32_e32 v6, v0 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v6 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] +; GCN3-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB137_3 -; GCN3-NEXT: s_branch .LBB137_4 -; GCN3-NEXT: .LBB137_2: +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_cbranch_execnz .LBB137_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[34:35] +; GCN3-NEXT: s_branch .LBB137_6 +; GCN3-NEXT: .LBB137_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB137_3: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB137_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -21383,7 +23849,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB137_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB137_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst @@ -21402,20 +23868,41 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN1-NEXT: s_cmp_eq_u32 s35, s36 ; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB138_2 +; GCN1-NEXT: s_cbranch_vccz .LBB138_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_add_u32 s36, s34, 4 +; GCN1-NEXT: s_addc_u32 s37, s35, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s36 +; GCN1-NEXT: v_mov_b32_e32 v1, s37 +; GCN1-NEXT: v_mov_b32_e32 v2, s34 +; GCN1-NEXT: v_mov_b32_e32 v3, s35 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[36:37], 0 +; GCN1-NEXT: .LBB138_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v1 +; GCN1-NEXT: v_mov_b32_e32 v6, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v6 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] +; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB138_3 -; GCN1-NEXT: s_branch .LBB138_4 -; GCN1-NEXT: .LBB138_2: +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_cbranch_execnz .LBB138_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN1-NEXT: s_branch .LBB138_6 +; GCN1-NEXT: .LBB138_4: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB138_3: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB138_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 ; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec ; GCN1-NEXT: s_cselect_b32 s34, s34, -1 @@ -21433,7 +23920,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc ; GCN1-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB138_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB138_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -21448,20 +23935,41 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN2-NEXT: s_cmp_eq_u32 s35, s36 ; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB138_2 +; GCN2-NEXT: s_cbranch_vccz .LBB138_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_add_u32 s36, s34, 4 +; GCN2-NEXT: s_addc_u32 s37, s35, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s36 +; GCN2-NEXT: v_mov_b32_e32 v1, s37 +; GCN2-NEXT: v_mov_b32_e32 v2, s34 +; GCN2-NEXT: v_mov_b32_e32 v3, s35 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[36:37], 0 +; GCN2-NEXT: .LBB138_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v1 +; GCN2-NEXT: v_mov_b32_e32 v6, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v6 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] +; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB138_3 -; GCN2-NEXT: s_branch .LBB138_4 -; GCN2-NEXT: .LBB138_2: +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_cbranch_execnz .LBB138_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN2-NEXT: s_branch .LBB138_6 +; GCN2-NEXT: .LBB138_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB138_3: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB138_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -21478,7 +23986,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc ; GCN2-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v5, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB138_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB138_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -21491,20 +23999,36 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: s_cmp_eq_u32 s35, s37 ; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB138_2 +; GCN3-NEXT: s_cbranch_vccz .LBB138_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v2, s34 +; GCN3-NEXT: v_mov_b32_e32 v3, s35 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[36:37], 0 +; GCN3-NEXT: .LBB138_2: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v7, v1 +; GCN3-NEXT: v_mov_b32_e32 v6, v0 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v6 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] +; GCN3-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB138_3 -; GCN3-NEXT: s_branch .LBB138_4 -; GCN3-NEXT: .LBB138_2: +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GCN3-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_cbranch_execnz .LBB138_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[36:37] +; GCN3-NEXT: s_branch .LBB138_6 +; GCN3-NEXT: .LBB138_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB138_3: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB138_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 ; GCN3-NEXT: s_cselect_b32 s34, s34, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -21519,7 +24043,7 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB138_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB138_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 @@ -21540,21 +24064,42 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB139_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB139_4 +; GCN1-NEXT: s_cbranch_execnz .LBB139_6 ; GCN1-NEXT: .LBB139_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB139_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB139_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB139_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB139_2 -; GCN1-NEXT: .LBB139_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB139_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -21585,21 +24130,42 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB139_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB139_4 +; GCN2-NEXT: s_cbranch_execnz .LBB139_6 ; GCN2-NEXT: .LBB139_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB139_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB139_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 1, v6 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB139_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB139_2 -; GCN2-NEXT: .LBB139_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB139_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -21628,21 +24194,39 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB139_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB139_4 +; GCN3-NEXT: s_cbranch_execnz .LBB139_6 ; GCN3-NEXT: .LBB139_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB139_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB139_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GCN3-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB139_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB139_2 -; GCN3-NEXT: .LBB139_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB139_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -21678,21 +24262,42 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB140_3 -; GCN1-NEXT: ; %bb.1: ; %Flow +; GCN1-NEXT: ; %bb.1: ; %Flow3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB140_4 +; GCN1-NEXT: s_cbranch_execnz .LBB140_6 ; GCN1-NEXT: .LBB140_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB140_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[6:7], 0 +; GCN1-NEXT: .LBB140_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 1, v8 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN1-NEXT: s_cbranch_execnz .LBB140_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB140_2 -; GCN1-NEXT: .LBB140_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB140_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -21724,21 +24329,42 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB140_3 -; GCN2-NEXT: ; %bb.1: ; %Flow +; GCN2-NEXT: ; %bb.1: ; %Flow3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB140_4 +; GCN2-NEXT: s_cbranch_execnz .LBB140_6 ; GCN2-NEXT: .LBB140_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB140_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[6:7], 0 +; GCN2-NEXT: .LBB140_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 1, v8 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN2-NEXT: s_cbranch_execnz .LBB140_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB140_2 -; GCN2-NEXT: .LBB140_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB140_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -21768,21 +24394,39 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB140_3 -; GCN3-NEXT: ; %bb.1: ; %Flow +; GCN3-NEXT: ; %bb.1: ; %Flow3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB140_4 +; GCN3-NEXT: s_cbranch_execnz .LBB140_6 ; GCN3-NEXT: .LBB140_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB140_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[6:7], 0 +; GCN3-NEXT: .LBB140_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 1, v8 +; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v9, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GCN3-NEXT: s_cbranch_execnz .LBB140_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB140_2 -; GCN3-NEXT: .LBB140_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB140_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -21817,23 +24461,46 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB141_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB141_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN1-NEXT: s_cbranch_execnz .LBB141_6 ; GCN1-NEXT: .LBB141_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB141_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[10:11], 0 +; GCN1-NEXT: .LBB141_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v4, s[6:7], -1, v6 +; GCN1-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN1-NEXT: s_cbranch_execnz .LBB141_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN1-NEXT: s_cbranch_execz .LBB141_2 -; GCN1-NEXT: .LBB141_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB141_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -21861,23 +24528,46 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB141_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB141_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN2-NEXT: s_cbranch_execnz .LBB141_6 ; GCN2-NEXT: .LBB141_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB141_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[10:11], 0 +; GCN2-NEXT: .LBB141_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; GCN2-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN2-NEXT: s_cbranch_execnz .LBB141_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN2-NEXT: s_cbranch_execz .LBB141_2 -; GCN2-NEXT: .LBB141_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB141_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -21903,23 +24593,43 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB141_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB141_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN3-NEXT: s_cbranch_execnz .LBB141_6 ; GCN3-NEXT: .LBB141_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB141_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[10:11], 0 +; GCN3-NEXT: .LBB141_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GCN3-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN3-NEXT: s_cbranch_execnz .LBB141_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN3-NEXT: s_cbranch_execz .LBB141_2 -; GCN3-NEXT: .LBB141_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB141_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -21952,23 +24662,46 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB142_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB142_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN1-NEXT: s_cbranch_execnz .LBB142_6 ; GCN1-NEXT: .LBB142_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB142_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[10:11], 0 +; GCN1-NEXT: .LBB142_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v4, s[6:7], -1, v6 +; GCN1-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN1-NEXT: s_cbranch_execnz .LBB142_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN1-NEXT: s_cbranch_execz .LBB142_2 -; GCN1-NEXT: .LBB142_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB142_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -21998,23 +24731,46 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB142_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB142_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN2-NEXT: s_cbranch_execnz .LBB142_6 ; GCN2-NEXT: .LBB142_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB142_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[10:11], 0 +; GCN2-NEXT: .LBB142_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; GCN2-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN2-NEXT: s_cbranch_execnz .LBB142_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN2-NEXT: s_cbranch_execz .LBB142_2 -; GCN2-NEXT: .LBB142_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB142_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -22042,23 +24798,43 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB142_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB142_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN3-NEXT: s_cbranch_execnz .LBB142_6 ; GCN3-NEXT: .LBB142_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB142_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[10:11], 0 +; GCN3-NEXT: .LBB142_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GCN3-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN3-NEXT: s_cbranch_execnz .LBB142_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN3-NEXT: s_cbranch_execz .LBB142_2 -; GCN3-NEXT: .LBB142_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB142_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -22087,46 +24863,65 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_mov_b64 s[4:5], 0xe4 ; GCN1-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN1-NEXT: v_mov_b32_e32 v5, v1 -; GCN1-NEXT: v_mov_b32_e32 v4, v0 -; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB143_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB143_4 -; GCN1-NEXT: .LBB143_2: ; %atomicrmw.phi -; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN1-NEXT: s_setpc_b64 s[30:31] -; GCN1-NEXT: .LBB143_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN1-NEXT: s_cbranch_execz .LBB143_4 +; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v5, v[4:5] +; GCN1-NEXT: flat_load_dword v4, v[0:1] +; GCN1-NEXT: s_mov_b64 s[10:11], 0 +; GCN1-NEXT: .LBB143_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v4, s[6:7], -1, v6 +; GCN1-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN1-NEXT: s_cbranch_execnz .LBB143_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN1-NEXT: s_cbranch_execz .LBB143_2 -; GCN1-NEXT: .LBB143_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 -; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: .LBB143_4: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN1-NEXT: s_cbranch_execz .LBB143_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN1-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN1-NEXT: v_add_i32_e32 v1, vcc, 4, v0 +; GCN1-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_add_i32_e64 v6, s[6:7], -1, v0 +; GCN1-NEXT: v_add_i32_e64 v6, s[6:7], -1, v4 ; GCN1-NEXT: s_waitcnt vmcnt(0) -; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN1-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7] +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[2:3] +; GCN1-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v5, s[6:7] ; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN1-NEXT: .LBB143_6: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: v_mov_b32_e32 v0, v4 +; GCN1-NEXT: v_mov_b32_e32 v1, v5 ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -22135,46 +24930,65 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_mov_b64 s[4:5], 0xe4 ; GCN2-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN2-NEXT: v_mov_b32_e32 v5, v1 -; GCN2-NEXT: v_mov_b32_e32 v4, v0 -; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 +; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB143_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB143_4 -; GCN2-NEXT: .LBB143_2: ; %atomicrmw.phi -; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN2-NEXT: s_setpc_b64 s[30:31] -; GCN2-NEXT: .LBB143_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: s_xor_b64 s[8:9], exec, s[4:5] +; GCN2-NEXT: s_cbranch_execz .LBB143_4 +; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v5, v[4:5] +; GCN2-NEXT: flat_load_dword v4, v[0:1] +; GCN2-NEXT: s_mov_b64 s[10:11], 0 +; GCN2-NEXT: .LBB143_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; GCN2-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN2-NEXT: s_cbranch_execnz .LBB143_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[10:11] +; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN2-NEXT: s_cbranch_execz .LBB143_2 -; GCN2-NEXT: .LBB143_4: ; %atomicrmw.private -; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 -; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: .LBB143_4: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN2-NEXT: s_cbranch_execz .LBB143_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN2-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc +; GCN2-NEXT: v_add_u32_e32 v1, vcc, 4, v0 +; GCN2-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_add_u32_e64 v6, s[6:7], -1, v0 +; GCN2-NEXT: v_add_u32_e64 v6, s[6:7], -1, v4 ; GCN2-NEXT: s_waitcnt vmcnt(0) -; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN2-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7] +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[2:3] +; GCN2-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v5, s[6:7] ; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen +; GCN2-NEXT: .LBB143_6: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: v_mov_b32_e32 v0, v4 +; GCN2-NEXT: v_mov_b32_e32 v1, v5 ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -22187,23 +25001,43 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB143_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB143_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN3-NEXT: s_cbranch_execnz .LBB143_6 ; GCN3-NEXT: .LBB143_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB143_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[10:11], 0 +; GCN3-NEXT: .LBB143_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v0, s[6:7], -1, v8 +; GCN3-NEXT: v_addc_co_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN3-NEXT: s_cbranch_execnz .LBB143_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN3-NEXT: s_cbranch_execz .LBB143_2 -; GCN3-NEXT: .LBB143_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB143_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -22238,23 +25072,46 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB144_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB144_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN1-NEXT: s_cbranch_execnz .LBB144_6 ; GCN1-NEXT: .LBB144_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB144_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[10:11], 0 +; GCN1-NEXT: .LBB144_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v0, s[6:7], -1, v8 +; GCN1-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN1-NEXT: s_cbranch_execnz .LBB144_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN1-NEXT: s_cbranch_execz .LBB144_2 -; GCN1-NEXT: .LBB144_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB144_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -22286,23 +25143,46 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB144_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB144_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN2-NEXT: s_cbranch_execnz .LBB144_6 ; GCN2-NEXT: .LBB144_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB144_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[10:11], 0 +; GCN2-NEXT: .LBB144_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v0, s[6:7], -1, v8 +; GCN2-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN2-NEXT: s_cbranch_execnz .LBB144_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN2-NEXT: s_cbranch_execz .LBB144_2 -; GCN2-NEXT: .LBB144_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB144_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -22332,23 +25212,43 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB144_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB144_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN3-NEXT: s_cbranch_execnz .LBB144_6 ; GCN3-NEXT: .LBB144_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB144_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[10:11], 0 +; GCN3-NEXT: .LBB144_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v0, s[6:7], -1, v8 +; GCN3-NEXT: v_addc_co_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN3-NEXT: s_cbranch_execnz .LBB144_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN3-NEXT: s_cbranch_execz .LBB144_2 -; GCN3-NEXT: .LBB144_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB144_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -22384,21 +25284,46 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN1-NEXT: s_mov_b64 s[34:35], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB145_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB145_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccnz .LBB145_6 ; GCN1-NEXT: .LBB145_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB145_3: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v4, s4 +; GCN1-NEXT: v_mov_b32_e32 v5, s5 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[38:39], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s7 +; GCN1-NEXT: v_mov_b32_e32 v7, s6 +; GCN1-NEXT: .LBB145_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v2 +; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB145_2 -; GCN1-NEXT: .LBB145_4: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN1-NEXT: s_cbranch_execnz .LBB145_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[38:39] +; GCN1-NEXT: s_branch .LBB145_2 +; GCN1-NEXT: .LBB145_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec @@ -22433,21 +25358,46 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN2-NEXT: s_mov_b64 s[34:35], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB145_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB145_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccnz .LBB145_6 ; GCN2-NEXT: .LBB145_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB145_3: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v4, s4 +; GCN2-NEXT: v_mov_b32_e32 v5, s5 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[38:39], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: v_mov_b32_e32 v7, s6 +; GCN2-NEXT: .LBB145_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2 +; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB145_2 -; GCN2-NEXT: .LBB145_4: ; %atomicrmw.private +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN2-NEXT: s_cbranch_execnz .LBB145_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[38:39] +; GCN2-NEXT: s_branch .LBB145_2 +; GCN2-NEXT: .LBB145_6: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -22479,21 +25429,41 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] ; GCN3-NEXT: s_mov_b64 s[34:35], -1 ; GCN3-NEXT: s_cbranch_vccnz .LBB145_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB145_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccnz .LBB145_6 ; GCN3-NEXT: .LBB145_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB145_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s4 +; GCN3-NEXT: v_mov_b32_e32 v5, s5 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[38:39], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s7 +; GCN3-NEXT: v_mov_b32_e32 v7, s6 +; GCN3-NEXT: .LBB145_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2 +; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB145_2 -; GCN3-NEXT: .LBB145_4: ; %atomicrmw.private +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN3-NEXT: s_cbranch_execnz .LBB145_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[38:39] +; GCN3-NEXT: s_branch .LBB145_2 +; GCN3-NEXT: .LBB145_6: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -22522,34 +25492,59 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 -; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 -; GCN1-NEXT: s_add_u32 s34, s4, 32 -; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_add_u32 s38, s4, 32 +; GCN1-NEXT: s_addc_u32 s39, s5, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s35, s36 -; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_mov_b64 s[36:37], -1 +; GCN1-NEXT: s_cmp_eq_u32 s39, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_mov_b64 s[34:35], -1 ; GCN1-NEXT: s_cbranch_vccnz .LBB146_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB146_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccnz .LBB146_6 ; GCN1-NEXT: .LBB146_2: ; %atomicrmw.phi ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB146_3: ; %atomicrmw.global +; GCN1-NEXT: s_add_u32 s34, s38, 4 +; GCN1-NEXT: s_addc_u32 s35, s39, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v4, s38 +; GCN1-NEXT: v_mov_b32_e32 v5, s39 +; GCN1-NEXT: flat_load_dword v3, v[0:1] +; GCN1-NEXT: flat_load_dword v2, v[4:5] +; GCN1-NEXT: s_mov_b64 s[40:41], 0 +; GCN1-NEXT: v_mov_b32_e32 v6, s7 +; GCN1-NEXT: v_mov_b32_e32 v7, s6 +; GCN1-NEXT: .LBB146_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v2 +; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execnz .LBB146_2 -; GCN1-NEXT: .LBB146_4: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_mov_b32_e32 v3, v1 +; GCN1-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; GCN1-NEXT: v_mov_b32_e32 v2, v0 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[40:41] +; GCN1-NEXT: s_cbranch_execnz .LBB146_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[40:41] +; GCN1-NEXT: s_branch .LBB146_2 +; GCN1-NEXT: .LBB146_6: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[38:39], 0 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec -; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s38, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_add_i32 s34, s34, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s34 @@ -22573,32 +25568,57 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 -; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 -; GCN2-NEXT: s_add_u32 s34, s4, 32 -; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_add_u32 s38, s4, 32 +; GCN2-NEXT: s_addc_u32 s39, s5, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s35, s36 -; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_mov_b64 s[36:37], -1 +; GCN2-NEXT: s_cmp_eq_u32 s39, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_mov_b64 s[34:35], -1 ; GCN2-NEXT: s_cbranch_vccnz .LBB146_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB146_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccnz .LBB146_6 ; GCN2-NEXT: .LBB146_2: ; %atomicrmw.phi ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB146_3: ; %atomicrmw.global +; GCN2-NEXT: s_add_u32 s34, s38, 4 +; GCN2-NEXT: s_addc_u32 s35, s39, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v4, s38 +; GCN2-NEXT: v_mov_b32_e32 v5, s39 +; GCN2-NEXT: flat_load_dword v3, v[0:1] +; GCN2-NEXT: flat_load_dword v2, v[4:5] +; GCN2-NEXT: s_mov_b64 s[40:41], 0 +; GCN2-NEXT: v_mov_b32_e32 v6, s7 +; GCN2-NEXT: v_mov_b32_e32 v7, s6 +; GCN2-NEXT: .LBB146_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2 +; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execnz .LBB146_2 -; GCN2-NEXT: .LBB146_4: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 -; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_mov_b32_e32 v3, v1 +; GCN2-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; GCN2-NEXT: v_mov_b32_e32 v2, v0 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[40:41] +; GCN2-NEXT: s_cbranch_execnz .LBB146_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[40:41] +; GCN2-NEXT: s_branch .LBB146_2 +; GCN2-NEXT: .LBB146_6: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[38:39], 0 +; GCN2-NEXT: s_cselect_b32 s34, s38, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_add_i32 s34, s34, 4 ; GCN2-NEXT: v_mov_b32_e32 v3, s34 @@ -22622,31 +25642,51 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN3-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 -; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base -; GCN3-NEXT: s_cmp_eq_u32 s35, s37 -; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_mov_b64 s[36:37], -1 +; GCN3-NEXT: s_add_u32 s38, s4, 32 +; GCN3-NEXT: s_addc_u32 s39, s5, 0 +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s39, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_mov_b64 s[34:35], -1 ; GCN3-NEXT: s_cbranch_vccnz .LBB146_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB146_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_and_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccnz .LBB146_6 ; GCN3-NEXT: .LBB146_2: ; %atomicrmw.phi ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB146_3: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v4, s38 +; GCN3-NEXT: v_mov_b32_e32 v5, s39 +; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GCN3-NEXT: s_mov_b64 s[40:41], 0 +; GCN3-NEXT: v_mov_b32_e32 v6, s7 +; GCN3-NEXT: v_mov_b32_e32 v7, s6 +; GCN3-NEXT: .LBB146_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2 +; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execnz .LBB146_2 -; GCN3-NEXT: .LBB146_4: ; %atomicrmw.private -; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 -; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_mov_b32_e32 v3, v1 +; GCN3-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; GCN3-NEXT: v_mov_b32_e32 v2, v0 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[40:41] +; GCN3-NEXT: s_cbranch_execnz .LBB146_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[40:41] +; GCN3-NEXT: s_branch .LBB146_2 +; GCN3-NEXT: .LBB146_6: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[38:39], 0 +; GCN3-NEXT: s_cselect_b32 s34, s38, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 @@ -22679,20 +25719,45 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN1-NEXT: s_cmp_eq_u32 s5, s34 ; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN1-NEXT: s_cbranch_vccz .LBB147_2 +; GCN1-NEXT: s_cbranch_vccz .LBB147_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s4 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 -; GCN1-NEXT: v_mov_b32_e32 v1, s5 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_add_u32 s34, s4, 4 +; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: v_mov_b32_e32 v0, s34 +; GCN1-NEXT: v_mov_b32_e32 v1, s35 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[38:39], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: .LBB147_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v8 +; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB147_3 -; GCN1-NEXT: s_branch .LBB147_4 -; GCN1-NEXT: .LBB147_2: +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN1-NEXT: s_cbranch_execnz .LBB147_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[38:39] +; GCN1-NEXT: s_branch .LBB147_6 +; GCN1-NEXT: .LBB147_4: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB147_3: ; %atomicrmw.private +; GCN1-NEXT: s_cbranch_execz .LBB147_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[4:5], 0 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 ; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec @@ -22714,7 +25779,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB147_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB147_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -22727,20 +25792,45 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN2-NEXT: s_cmp_eq_u32 s5, s34 ; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN2-NEXT: s_cbranch_vccz .LBB147_2 +; GCN2-NEXT: s_cbranch_vccz .LBB147_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s4 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 -; GCN2-NEXT: v_mov_b32_e32 v1, s5 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_add_u32 s34, s4, 4 +; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: v_mov_b32_e32 v0, s34 +; GCN2-NEXT: v_mov_b32_e32 v1, s35 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[38:39], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: .LBB147_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8 +; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB147_3 -; GCN2-NEXT: s_branch .LBB147_4 -; GCN2-NEXT: .LBB147_2: +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN2-NEXT: s_cbranch_execnz .LBB147_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[38:39] +; GCN2-NEXT: s_branch .LBB147_6 +; GCN2-NEXT: .LBB147_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB147_3: ; %atomicrmw.private +; GCN2-NEXT: s_cbranch_execz .LBB147_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN2-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 @@ -22761,7 +25851,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB147_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB147_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -22772,20 +25862,40 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN3-NEXT: s_cmp_eq_u32 s5, s35 ; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 ; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] -; GCN3-NEXT: s_cbranch_vccz .LBB147_2 +; GCN3-NEXT: s_cbranch_vccz .LBB147_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s4 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s5 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v2, s4 +; GCN3-NEXT: v_mov_b32_e32 v3, s5 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[38:39], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s7 +; GCN3-NEXT: v_mov_b32_e32 v5, s6 +; GCN3-NEXT: .LBB147_2: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v8 +; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB147_3 -; GCN3-NEXT: s_branch .LBB147_4 -; GCN3-NEXT: .LBB147_2: +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GCN3-NEXT: s_cbranch_execnz .LBB147_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[38:39] +; GCN3-NEXT: s_branch .LBB147_6 +; GCN3-NEXT: .LBB147_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB147_3: ; %atomicrmw.private +; GCN3-NEXT: s_cbranch_execz .LBB147_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private ; GCN3-NEXT: s_cmp_lg_u64 s[4:5], 0 ; GCN3-NEXT: s_cselect_b32 s34, s4, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 @@ -22804,7 +25914,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB147_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB147_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst @@ -22816,31 +25926,56 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN1: ; %bb.0: ; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN1-NEXT: s_mov_b64 s[34:35], 0xe4 -; GCN1-NEXT: s_load_dword s36, s[34:35], 0x0 -; GCN1-NEXT: s_add_u32 s34, s4, 32 -; GCN1-NEXT: s_addc_u32 s35, s5, 0 +; GCN1-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN1-NEXT: s_add_u32 s38, s4, 32 +; GCN1-NEXT: s_addc_u32 s39, s5, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s35, s36 -; GCN1-NEXT: s_cselect_b64 s[36:37], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN1-NEXT: s_cbranch_vccz .LBB148_2 +; GCN1-NEXT: s_cmp_eq_u32 s39, s34 +; GCN1-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN1-NEXT: s_cbranch_vccz .LBB148_4 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN1-NEXT: s_add_u32 s34, s38, 4 +; GCN1-NEXT: s_addc_u32 s35, s39, 0 ; GCN1-NEXT: v_mov_b32_e32 v0, s34 -; GCN1-NEXT: v_mov_b32_e32 v2, s6 ; GCN1-NEXT: v_mov_b32_e32 v1, s35 -; GCN1-NEXT: v_mov_b32_e32 v3, s7 -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: v_mov_b32_e32 v2, s38 +; GCN1-NEXT: v_mov_b32_e32 v3, s39 +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[2:3] +; GCN1-NEXT: s_mov_b64 s[40:41], 0 +; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: v_mov_b32_e32 v5, s6 +; GCN1-NEXT: .LBB148_2: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v8 +; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol -; GCN1-NEXT: s_cbranch_execz .LBB148_3 -; GCN1-NEXT: s_branch .LBB148_4 -; GCN1-NEXT: .LBB148_2: +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[40:41] +; GCN1-NEXT: s_cbranch_execnz .LBB148_2 +; GCN1-NEXT: ; %bb.3: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[40:41] +; GCN1-NEXT: s_branch .LBB148_6 +; GCN1-NEXT: .LBB148_4: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN1-NEXT: .LBB148_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[36:37], s[34:35], 0 +; GCN1-NEXT: s_cbranch_execz .LBB148_6 +; GCN1-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN1-NEXT: v_cmp_ne_u64_e64 s[34:35], s[38:39], 0 ; GCN1-NEXT: v_mov_b32_e32 v5, s6 -; GCN1-NEXT: s_and_b64 s[36:37], s[36:37], exec -; GCN1-NEXT: s_cselect_b32 s34, s34, -1 +; GCN1-NEXT: s_and_b64 s[34:35], s[34:35], exec +; GCN1-NEXT: s_cselect_b32 s34, s38, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_add_i32 s34, s34, 4 ; GCN1-NEXT: v_mov_b32_e32 v3, s34 @@ -22858,7 +25993,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; GCN1-NEXT: .LBB148_4: ; %atomicrmw.end +; GCN1-NEXT: .LBB148_6: ; %atomicrmw.phi ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -22866,29 +26001,54 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN2: ; %bb.0: ; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN2-NEXT: s_mov_b64 s[34:35], 0xe4 -; GCN2-NEXT: s_load_dword s36, s[34:35], 0x0 -; GCN2-NEXT: s_add_u32 s34, s4, 32 -; GCN2-NEXT: s_addc_u32 s35, s5, 0 +; GCN2-NEXT: s_load_dword s34, s[34:35], 0x0 +; GCN2-NEXT: s_add_u32 s38, s4, 32 +; GCN2-NEXT: s_addc_u32 s39, s5, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s35, s36 -; GCN2-NEXT: s_cselect_b64 s[36:37], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN2-NEXT: s_cbranch_vccz .LBB148_2 +; GCN2-NEXT: s_cmp_eq_u32 s39, s34 +; GCN2-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN2-NEXT: s_cbranch_vccz .LBB148_4 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global +; GCN2-NEXT: s_add_u32 s34, s38, 4 +; GCN2-NEXT: s_addc_u32 s35, s39, 0 ; GCN2-NEXT: v_mov_b32_e32 v0, s34 -; GCN2-NEXT: v_mov_b32_e32 v2, s6 ; GCN2-NEXT: v_mov_b32_e32 v1, s35 -; GCN2-NEXT: v_mov_b32_e32 v3, s7 -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: v_mov_b32_e32 v2, s38 +; GCN2-NEXT: v_mov_b32_e32 v3, s39 +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[2:3] +; GCN2-NEXT: s_mov_b64 s[40:41], 0 +; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: v_mov_b32_e32 v5, s6 +; GCN2-NEXT: .LBB148_2: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8 +; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol -; GCN2-NEXT: s_cbranch_execz .LBB148_3 -; GCN2-NEXT: s_branch .LBB148_4 -; GCN2-NEXT: .LBB148_2: +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[40:41] +; GCN2-NEXT: s_cbranch_execnz .LBB148_2 +; GCN2-NEXT: ; %bb.3: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[40:41] +; GCN2-NEXT: s_branch .LBB148_6 +; GCN2-NEXT: .LBB148_4: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN2-NEXT: .LBB148_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[34:35], 0 -; GCN2-NEXT: s_cselect_b32 s34, s34, -1 +; GCN2-NEXT: s_cbranch_execz .LBB148_6 +; GCN2-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN2-NEXT: s_cmp_lg_u64 s[38:39], 0 +; GCN2-NEXT: s_cselect_b32 s34, s38, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_add_i32 s34, s34, 4 ; GCN2-NEXT: v_mov_b32_e32 v3, s34 @@ -22907,35 +26067,55 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[0:3], 0 offen -; GCN2-NEXT: .LBB148_4: ; %atomicrmw.end +; GCN2-NEXT: .LBB148_6: ; %atomicrmw.phi ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; ; GCN3-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: ; GCN3: ; %bb.0: ; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN3-NEXT: s_add_u32 s34, s4, 32 -; GCN3-NEXT: s_addc_u32 s35, s5, 0 -; GCN3-NEXT: s_mov_b64 s[36:37], src_private_base -; GCN3-NEXT: s_cmp_eq_u32 s35, s37 -; GCN3-NEXT: s_cselect_b64 s[36:37], -1, 0 -; GCN3-NEXT: s_andn2_b64 vcc, exec, s[36:37] -; GCN3-NEXT: s_cbranch_vccz .LBB148_2 +; GCN3-NEXT: s_add_u32 s38, s4, 32 +; GCN3-NEXT: s_addc_u32 s39, s5, 0 +; GCN3-NEXT: s_mov_b64 s[34:35], src_private_base +; GCN3-NEXT: s_cmp_eq_u32 s39, s35 +; GCN3-NEXT: s_cselect_b64 s[34:35], -1, 0 +; GCN3-NEXT: s_andn2_b64 vcc, exec, s[34:35] +; GCN3-NEXT: s_cbranch_vccz .LBB148_4 ; GCN3-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN3-NEXT: v_mov_b32_e32 v0, s34 -; GCN3-NEXT: v_mov_b32_e32 v2, s6 -; GCN3-NEXT: v_mov_b32_e32 v1, s35 -; GCN3-NEXT: v_mov_b32_e32 v3, s7 -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN3-NEXT: v_mov_b32_e32 v2, s38 +; GCN3-NEXT: v_mov_b32_e32 v3, s39 +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; GCN3-NEXT: s_mov_b64 s[40:41], 0 +; GCN3-NEXT: v_mov_b32_e32 v4, s7 +; GCN3-NEXT: v_mov_b32_e32 v5, s6 +; GCN3-NEXT: .LBB148_2: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v8 +; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol -; GCN3-NEXT: s_cbranch_execz .LBB148_3 -; GCN3-NEXT: s_branch .LBB148_4 -; GCN3-NEXT: .LBB148_2: +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[40:41], vcc, s[40:41] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[40:41] +; GCN3-NEXT: s_cbranch_execnz .LBB148_2 +; GCN3-NEXT: ; %bb.3: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[40:41] +; GCN3-NEXT: s_branch .LBB148_6 +; GCN3-NEXT: .LBB148_4: ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GCN3-NEXT: .LBB148_3: ; %atomicrmw.private -; GCN3-NEXT: s_cmp_lg_u64 s[34:35], 0 -; GCN3-NEXT: s_cselect_b32 s34, s34, -1 +; GCN3-NEXT: s_cbranch_execz .LBB148_6 +; GCN3-NEXT: ; %bb.5: ; %atomicrmw.private +; GCN3-NEXT: s_cmp_lg_u64 s[38:39], 0 +; GCN3-NEXT: s_cselect_b32 s34, s38, -1 ; GCN3-NEXT: v_mov_b32_e32 v2, s34 ; GCN3-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 @@ -22952,7 +26132,7 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 -; GCN3-NEXT: .LBB148_4: ; %atomicrmw.end +; GCN3-NEXT: .LBB148_6: ; %atomicrmw.phi ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 @@ -22971,23 +26151,46 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN1-NEXT: s_waitcnt lgkmcnt(0) ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB149_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB149_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN1-NEXT: s_cbranch_execnz .LBB149_6 ; GCN1-NEXT: .LBB149_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB149_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN1-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN1-NEXT: flat_load_dword v7, v[4:5] +; GCN1-NEXT: flat_load_dword v6, v[0:1] +; GCN1-NEXT: s_mov_b64 s[10:11], 0 +; GCN1-NEXT: .LBB149_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v4, s[6:7], -1, v6 +; GCN1-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN1-NEXT: v_mov_b32_e32 v7, v5 +; GCN1-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN1-NEXT: v_mov_b32_e32 v6, v4 +; GCN1-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN1-NEXT: s_cbranch_execnz .LBB149_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN1-NEXT: s_cbranch_execz .LBB149_2 -; GCN1-NEXT: .LBB149_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB149_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -23017,23 +26220,46 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN2-NEXT: s_waitcnt lgkmcnt(0) ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB149_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB149_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN2-NEXT: s_cbranch_execnz .LBB149_6 ; GCN2-NEXT: .LBB149_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB149_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN2-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GCN2-NEXT: flat_load_dword v7, v[4:5] +; GCN2-NEXT: flat_load_dword v6, v[0:1] +; GCN2-NEXT: s_mov_b64 s[10:11], 0 +; GCN2-NEXT: .LBB149_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; GCN2-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN2-NEXT: v_mov_b32_e32 v7, v5 +; GCN2-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN2-NEXT: v_mov_b32_e32 v6, v4 +; GCN2-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN2-NEXT: s_cbranch_execnz .LBB149_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN2-NEXT: s_cbranch_execz .LBB149_2 -; GCN2-NEXT: .LBB149_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB149_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -23061,23 +26287,43 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN3-NEXT: s_mov_b64 s[4:5], src_private_base ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB149_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB149_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN3-NEXT: s_cbranch_execnz .LBB149_6 ; GCN3-NEXT: .LBB149_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB149_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN3-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GCN3-NEXT: s_mov_b64 s[10:11], 0 +; GCN3-NEXT: .LBB149_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GCN3-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GCN3-NEXT: v_mov_b32_e32 v7, v5 +; GCN3-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN3-NEXT: v_mov_b32_e32 v6, v4 +; GCN3-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN3-NEXT: s_cbranch_execnz .LBB149_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN3-NEXT: s_cbranch_execz .LBB149_2 -; GCN3-NEXT: .LBB149_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB149_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen @@ -23112,23 +26358,46 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN1-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN1-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB150_3 -; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN1-NEXT: s_cbranch_execnz .LBB150_4 +; GCN1-NEXT: ; %bb.1: ; %Flow3 +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN1-NEXT: s_cbranch_execnz .LBB150_6 ; GCN1-NEXT: .LBB150_2: ; %atomicrmw.phi ; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB150_3: ; %atomicrmw.global -; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc +; GCN1-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN1-NEXT: flat_load_dword v1, v[0:1] +; GCN1-NEXT: flat_load_dword v0, v[4:5] +; GCN1-NEXT: s_mov_b64 s[10:11], 0 +; GCN1-NEXT: .LBB150_4: ; %atomicrmw.start +; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v9, v1 +; GCN1-NEXT: v_mov_b32_e32 v8, v0 +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; GCN1-NEXT: v_add_i32_e64 v0, s[6:7], -1, v8 +; GCN1-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; GCN1-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN1-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN1-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN1-NEXT: s_cbranch_execnz .LBB150_4 +; GCN1-NEXT: ; %bb.5: ; %Flow +; GCN1-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN1-NEXT: s_cbranch_execz .LBB150_2 -; GCN1-NEXT: .LBB150_4: ; %atomicrmw.private +; GCN1-NEXT: .LBB150_6: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 @@ -23160,23 +26429,46 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN2-NEXT: v_cmp_ne_u32_e32 vcc, s4, v5 ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN2-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB150_3 -; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN2-NEXT: s_cbranch_execnz .LBB150_4 +; GCN2-NEXT: ; %bb.1: ; %Flow3 +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN2-NEXT: s_cbranch_execnz .LBB150_6 ; GCN2-NEXT: .LBB150_2: ; %atomicrmw.phi ; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB150_3: ; %atomicrmw.global -; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc +; GCN2-NEXT: v_add_u32_e32 v0, vcc, 4, v4 +; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GCN2-NEXT: flat_load_dword v1, v[0:1] +; GCN2-NEXT: flat_load_dword v0, v[4:5] +; GCN2-NEXT: s_mov_b64 s[10:11], 0 +; GCN2-NEXT: .LBB150_4: ; %atomicrmw.start +; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v9, v1 +; GCN2-NEXT: v_mov_b32_e32 v8, v0 +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; GCN2-NEXT: v_add_u32_e64 v0, s[6:7], -1, v8 +; GCN2-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN2-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN2-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN2-NEXT: s_cbranch_execnz .LBB150_4 +; GCN2-NEXT: ; %bb.5: ; %Flow +; GCN2-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN2-NEXT: s_cbranch_execz .LBB150_2 -; GCN2-NEXT: .LBB150_4: ; %atomicrmw.private +; GCN2-NEXT: .LBB150_6: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 @@ -23206,23 +26498,43 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN3-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5 ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN3-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB150_3 -; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] -; GCN3-NEXT: s_cbranch_execnz .LBB150_4 +; GCN3-NEXT: ; %bb.1: ; %Flow3 +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] +; GCN3-NEXT: s_cbranch_execnz .LBB150_6 ; GCN3-NEXT: .LBB150_2: ; %atomicrmw.phi ; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB150_3: ; %atomicrmw.global -; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc +; GCN3-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; GCN3-NEXT: s_mov_b64 s[10:11], 0 +; GCN3-NEXT: .LBB150_4: ; %atomicrmw.start +; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN3-NEXT: v_mov_b32_e32 v9, v1 +; GCN3-NEXT: v_mov_b32_e32 v8, v0 +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; GCN3-NEXT: v_add_co_u32_e64 v0, s[6:7], -1, v8 +; GCN3-NEXT: v_addc_co_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN3-NEXT: buffer_wbinvl1_vol +; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GCN3-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN3-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GCN3-NEXT: s_cbranch_execnz .LBB150_4 +; GCN3-NEXT: ; %bb.5: ; %Flow +; GCN3-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[8:9] ; GCN3-NEXT: s_cbranch_execz .LBB150_2 -; GCN3-NEXT: .LBB150_4: ; %atomicrmw.private +; GCN3-NEXT: .LBB150_6: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll index fe47461ebf95..36c4c381d1b3 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll @@ -1097,25 +1097,76 @@ define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) { ; GFX7-LABEL: flat_atomic_sub_i64_noret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB30_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_sub_i64_noret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB30_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_sub_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB30_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void @@ -1125,29 +1176,80 @@ define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { ; GFX7-LABEL: flat_atomic_sub_i64_noret_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB31_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_sub_i64_noret_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB31_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_sub_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB31_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -1158,25 +1260,82 @@ define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { ; GFX7-LABEL: flat_atomic_sub_i64_ret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB32_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_sub_i64_ret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB32_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_sub_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB32_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result @@ -1186,29 +1345,82 @@ define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-LABEL: flat_atomic_sub_i64_ret_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_sub_i32_e32 v6, vcc, v8, v2 +; GFX7-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB33_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_sub_i64_ret_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v8, v2 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB33_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_sub_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB33_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -1219,37 +1431,95 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-LABEL: flat_atomic_sub_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 +; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB34_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_sub_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB34_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_sub_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB34_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void @@ -1261,13 +1531,32 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2 +; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB35_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_sub_i64_noret_offset_scalar: @@ -1275,25 +1564,60 @@ define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB35_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_sub_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB35_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -1304,37 +1628,95 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-LABEL: flat_atomic_sub_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v8, v1 +; GFX7-NEXT: v_mov_b32_e32 v7, v0 +; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7 +; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB36_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_sub_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v8, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 +; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB36_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_sub_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v4, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB36_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result @@ -1346,13 +1728,32 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v8, v1 +; GFX7-NEXT: v_mov_b32_e32 v7, v0 +; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7 +; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB37_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_sub_i64_ret_offset_scalar: @@ -1360,25 +1761,60 @@ define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v8, v1 +; GFX8-NEXT: v_mov_b32_e32 v7, v0 +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 +; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB37_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_sub_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v4, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB37_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -1389,29 +1825,80 @@ define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX7-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB38_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB38_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB38_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 @@ -1422,29 +1909,82 @@ define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX7-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_sub_i32_e32 v6, vcc, v8, v2 +; GFX7-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB39_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v8, v2 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB39_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB39_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 @@ -1459,25 +1999,76 @@ define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) { ; GFX7-LABEL: flat_atomic_and_i64_noret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB40_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_and_i64_noret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX8-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB40_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_and_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB40_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void @@ -1487,29 +2078,80 @@ define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { ; GFX7-LABEL: flat_atomic_and_i64_noret_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB41_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_and_i64_noret_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX8-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB41_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_and_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB41_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -1520,25 +2162,82 @@ define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) { ; GFX7-LABEL: flat_atomic_and_i64_ret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB42_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_and_i64_ret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX8-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB42_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_and_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB42_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result @@ -1548,29 +2247,82 @@ define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-LABEL: flat_atomic_and_i64_ret_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_and_b32_e32 v7, v9, v3 +; GFX7-NEXT: v_and_b32_e32 v6, v8, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB43_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_and_i64_ret_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_and_b32_e32 v7, v9, v3 +; GFX8-NEXT: v_and_b32_e32 v6, v8, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB43_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_and_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB43_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -1581,37 +2333,92 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-LABEL: flat_atomic_and_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB44_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_and_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX8-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB44_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_and_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB44_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void @@ -1623,13 +2430,31 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX7-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB45_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_and_i64_noret_offset_scalar: @@ -1637,25 +2462,58 @@ define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX8-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB45_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_and_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB45_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -1666,37 +2524,92 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-LABEL: flat_atomic_and_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_and_b32_e32 v5, s7, v7 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v6 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB46_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_and_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_and_b32_e32 v5, s7, v7 +; GFX8-NEXT: v_and_b32_e32 v4, s6, v6 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB46_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_and_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_and_b32_e32 v5, s7, v7 +; GFX9-NEXT: v_and_b32_e32 v4, s6, v6 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB46_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result @@ -1708,13 +2621,31 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_and_b32_e32 v5, s7, v7 +; GFX7-NEXT: v_and_b32_e32 v4, s6, v6 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB47_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_and_i64_ret_offset_scalar: @@ -1722,25 +2653,58 @@ define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_and_b32_e32 v5, s7, v7 +; GFX8-NEXT: v_and_b32_e32 v4, s6, v6 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB47_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_and_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_and_b32_e32 v5, s7, v7 +; GFX9-NEXT: v_and_b32_e32 v4, s6, v6 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB47_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -1751,29 +2715,80 @@ define void @flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX7-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB48_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX8-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB48_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB48_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 @@ -1784,29 +2799,82 @@ define i64 @flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX7-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_and_b32_e32 v7, v9, v3 +; GFX7-NEXT: v_and_b32_e32 v6, v8, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB49_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_and_b32_e32 v7, v9, v3 +; GFX8-NEXT: v_and_b32_e32 v6, v8, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB49_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB49_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 @@ -2771,25 +3839,76 @@ define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) { ; GFX7-LABEL: flat_atomic_or_i64_noret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB60_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_or_i64_noret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX8-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB60_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_or_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB60_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void @@ -2799,29 +3918,80 @@ define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { ; GFX7-LABEL: flat_atomic_or_i64_noret_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB61_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_or_i64_noret_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX8-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB61_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_or_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB61_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -2832,25 +4002,82 @@ define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) { ; GFX7-LABEL: flat_atomic_or_i64_ret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB62_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_or_i64_ret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX8-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB62_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_or_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB62_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result @@ -2860,29 +4087,82 @@ define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-LABEL: flat_atomic_or_i64_ret_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_or_b32_e32 v7, v9, v3 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB63_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_or_i64_ret_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_or_b32_e32 v7, v9, v3 +; GFX8-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB63_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_or_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB63_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -2893,37 +4173,92 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inre ; GFX7-LABEL: flat_atomic_or_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX7-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB64_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_or_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX8-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB64_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_or_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB64_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void @@ -2935,13 +4270,31 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX7-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB65_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_or_i64_noret_offset_scalar: @@ -2949,25 +4302,58 @@ define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX8-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB65_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_or_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB65_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -2978,37 +4364,92 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg % ; GFX7-LABEL: flat_atomic_or_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_or_b32_e32 v5, s7, v7 +; GFX7-NEXT: v_or_b32_e32 v4, s6, v6 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB66_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_or_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_or_b32_e32 v5, s7, v7 +; GFX8-NEXT: v_or_b32_e32 v4, s6, v6 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB66_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_or_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_or_b32_e32 v5, s7, v7 +; GFX9-NEXT: v_or_b32_e32 v4, s6, v6 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB66_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result @@ -3020,13 +4461,31 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_or_b32_e32 v5, s7, v7 +; GFX7-NEXT: v_or_b32_e32 v4, s6, v6 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB67_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_or_i64_ret_offset_scalar: @@ -3034,25 +4493,58 @@ define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_or_b32_e32 v5, s7, v7 +; GFX8-NEXT: v_or_b32_e32 v4, s6, v6 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB67_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_or_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_or_b32_e32 v5, s7, v7 +; GFX9-NEXT: v_or_b32_e32 v4, s6, v6 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB67_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -3063,29 +4555,80 @@ define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX7-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB68_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX8-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB68_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB68_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 @@ -3096,29 +4639,82 @@ define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 ; GFX7-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_or_b32_e32 v7, v9, v3 +; GFX7-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB69_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_or_b32_e32 v7, v9, v3 +; GFX8-NEXT: v_or_b32_e32 v6, v8, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB69_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB69_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 @@ -3133,25 +4729,76 @@ define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) { ; GFX7-LABEL: flat_atomic_xor_i64_noret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB70_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_xor_i64_noret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX8-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB70_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_xor_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB70_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void @@ -3161,29 +4808,80 @@ define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { ; GFX7-LABEL: flat_atomic_xor_i64_noret_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB71_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_xor_i64_noret_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX8-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB71_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_xor_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB71_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -3194,25 +4892,82 @@ define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) { ; GFX7-LABEL: flat_atomic_xor_i64_ret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB72_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_xor_i64_ret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX8-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB72_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_xor_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB72_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result @@ -3222,29 +4977,82 @@ define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-LABEL: flat_atomic_xor_i64_ret_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_xor_b32_e32 v7, v9, v3 +; GFX7-NEXT: v_xor_b32_e32 v6, v8, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB73_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_xor_i64_ret_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_xor_b32_e32 v7, v9, v3 +; GFX8-NEXT: v_xor_b32_e32 v6, v8, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB73_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_xor_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB73_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -3255,37 +5063,92 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inr ; GFX7-LABEL: flat_atomic_xor_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX7-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB74_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_xor_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX8-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB74_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_xor_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB74_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void @@ -3297,13 +5160,31 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX7-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB75_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_xor_i64_noret_offset_scalar: @@ -3311,25 +5192,58 @@ define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX8-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB75_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_xor_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB75_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -3340,37 +5254,92 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg ; GFX7-LABEL: flat_atomic_xor_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_xor_b32_e32 v5, s7, v7 +; GFX7-NEXT: v_xor_b32_e32 v4, s6, v6 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB76_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_xor_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_xor_b32_e32 v5, s7, v7 +; GFX8-NEXT: v_xor_b32_e32 v4, s6, v6 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB76_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_xor_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_xor_b32_e32 v5, s7, v7 +; GFX9-NEXT: v_xor_b32_e32 v4, s6, v6 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB76_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result @@ -3382,13 +5351,31 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_xor_b32_e32 v5, s7, v7 +; GFX7-NEXT: v_xor_b32_e32 v4, s6, v6 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB77_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_xor_i64_ret_offset_scalar: @@ -3396,25 +5383,58 @@ define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_xor_b32_e32 v5, s7, v7 +; GFX8-NEXT: v_xor_b32_e32 v4, s6, v6 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB77_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_xor_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_xor_b32_e32 v5, s7, v7 +; GFX9-NEXT: v_xor_b32_e32 v4, s6, v6 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB77_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -3425,29 +5445,80 @@ define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, ; GFX7-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX7-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB78_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX8-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB78_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB78_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 @@ -3458,29 +5529,82 @@ define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i6 ; GFX7-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_xor_b32_e32 v7, v9, v3 +; GFX7-NEXT: v_xor_b32_e32 v6, v8, v2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB79_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_xor_b32_e32 v7, v9, v3 +; GFX8-NEXT: v_xor_b32_e32 v6, v8, v2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB79_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB79_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 @@ -8476,25 +10600,85 @@ define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) { ; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB131_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB131_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB131_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v6 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB131_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB131_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB131_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void @@ -8504,29 +10688,89 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB132_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB132_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB132_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v6 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB132_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB132_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB132_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -8537,25 +10781,91 @@ define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { ; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB133_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB133_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB133_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v6 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB133_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB133_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB133_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result @@ -8565,29 +10875,91 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB134_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v8 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB134_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB134_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v8 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB134_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB134_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB134_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -8598,37 +10970,101 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB135_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB135_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB135_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB135_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB135_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB135_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void @@ -8640,13 +11076,34 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: .LBB136_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB136_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar: @@ -8654,25 +11111,64 @@ define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: .LBB136_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB136_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB136_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB136_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -8683,37 +11179,101 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: .LBB137_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] +; GFX7-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB137_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: .LBB137_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v6 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB137_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: .LBB137_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB137_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result @@ -8725,13 +11285,34 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: .LBB138_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] +; GFX7-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB138_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar: @@ -8739,25 +11320,64 @@ define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: .LBB138_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v6 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] +; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB138_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: .LBB138_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB138_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -8768,29 +11388,89 @@ define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB139_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v6 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB139_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB139_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v6 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB139_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB139_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB139_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 @@ -8801,29 +11481,91 @@ define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB140_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v8 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB140_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB140_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v8 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB140_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB140_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB140_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 @@ -8838,25 +11580,91 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: .LBB141_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX7-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX7-NEXT: v_add_i32_e64 v4, s[6:7], -1, v6 +; GFX7-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX7-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB141_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: .LBB141_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX8-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX8-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; GFX8-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB141_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB141_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB141_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void @@ -8866,29 +11674,95 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: .LBB142_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX7-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], -1, v6 +; GFX7-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v7, s[6:7] +; GFX7-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v1, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB142_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: .LBB142_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX8-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX8-NEXT: v_add_u32_e64 v0, s[6:7], -1, v6 +; GFX8-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v7, s[6:7] +; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v1, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB142_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB142_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB142_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -8899,25 +11773,97 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: .LBB143_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX7-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX7-NEXT: v_add_i32_e64 v4, s[6:7], -1, v6 +; GFX7-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX7-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB143_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: .LBB143_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX8-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX8-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; GFX8-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB143_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB143_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB143_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result @@ -8927,29 +11873,97 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: .LBB144_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX7-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], -1, v8 +; GFX7-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; GFX7-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB144_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_offset: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: .LBB144_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX8-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; GFX8-NEXT: v_add_u32_e64 v0, s[6:7], -1, v8 +; GFX8-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB144_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB144_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB144_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -8960,37 +11974,113 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[38:39], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB145_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX7-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GFX7-NEXT: v_add_i32_e64 v0, s[36:37], -1, v2 +; GFX7-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GFX7-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7-NEXT: s_cbranch_execnz .LBB145_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[38:39], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB145_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX8-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GFX8-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2 +; GFX8-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GFX8-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX8-NEXT: s_cbranch_execnz .LBB145_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB145_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB145_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret void @@ -9002,13 +12092,38 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 -; GFX7-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 -; GFX7-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[38:39], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: .LBB146_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX7-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GFX7-NEXT: v_add_i32_e64 v0, s[36:37], -1, v2 +; GFX7-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GFX7-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7-NEXT: s_cbranch_execnz .LBB146_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: @@ -9016,25 +12131,72 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[38:39], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: .LBB146_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX8-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GFX8-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2 +; GFX8-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GFX8-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX8-NEXT: s_cbranch_execnz .LBB146_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB146_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB146_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -9045,37 +12207,113 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] ; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[38:39], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: .LBB147_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX7-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; GFX7-NEXT: v_add_i32_e64 v0, s[36:37], -1, v8 +; GFX7-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GFX7-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7-NEXT: s_cbranch_execnz .LBB147_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[38:39], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: .LBB147_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX8-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; GFX8-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8 +; GFX8-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GFX8-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX8-NEXT: s_cbranch_execnz .LBB147_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: .LBB147_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v8 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB147_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 ret i64 %result @@ -9087,13 +12325,38 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_add_u32 s34, s4, 32 ; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 ; GFX7-NEXT: v_mov_b32_e32 v2, s34 -; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_mov_b32_e32 v1, s7 ; GFX7-NEXT: v_mov_b32_e32 v3, s35 -; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[38:39], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: .LBB148_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX7-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; GFX7-NEXT: v_add_i32_e64 v0, s[36:37], -1, v8 +; GFX7-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GFX7-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX7-NEXT: s_cbranch_execnz .LBB148_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: @@ -9101,25 +12364,72 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_add_u32 s34, s4, 32 ; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 ; GFX8-NEXT: v_mov_b32_e32 v2, s34 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v3, s35 -; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[38:39], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: .LBB148_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX8-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; GFX8-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8 +; GFX8-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GFX8-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX8-NEXT: s_cbranch_execnz .LBB148_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 ; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: .LBB148_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v8 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB148_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 @@ -9130,29 +12440,95 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: .LBB149_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX7-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], -1, v6 +; GFX7-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v7, s[6:7] +; GFX7-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v1, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB149_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: .LBB149_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX8-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX8-NEXT: v_add_u32_e64 v0, s[6:7], -1, v6 +; GFX8-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v7, s[6:7] +; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v1, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB149_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB149_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB149_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 @@ -9163,29 +12539,97 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: .LBB150_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX7-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], -1, v8 +; GFX7-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; GFX7-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX7-NEXT: s_cbranch_execnz .LBB150_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[8:9], 0 +; GFX8-NEXT: .LBB150_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX8-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; GFX8-NEXT: v_add_u32_e64 v0, s[6:7], -1, v8 +; GFX8-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_cbranch_execnz .LBB150_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB150_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB150_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index a867c6c1affb..978595feaa28 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -1284,26 +1284,68 @@ define void @global_atomic_sub_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB30_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_sub_i32_e32 v3, vcc, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB30_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_sub_i32_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB30_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_u32_e32 v3, vcc, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB30_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v[0:1], v2, off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB30_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -1317,9 +1359,25 @@ define void @global_atomic_sub_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB31_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_sub_i32_e32 v3, vcc, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB31_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1328,17 +1386,43 @@ define void @global_atomic_sub_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_sub v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB31_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_u32_e32 v3, vcc, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB31_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB31_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst @@ -1353,27 +1437,71 @@ define i32 @global_atomic_sub_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB32_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_sub_i32_e32 v4, vcc, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB32_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_sub_i32_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB32_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB32_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v0, v[0:1], v2, off glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB32_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -1387,29 +1515,72 @@ define i32 @global_atomic_sub_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB33_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_sub_i32_e32 v4, vcc, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB33_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_sub_i32_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB33_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_sub_u32_e32 v0, vcc, v1, v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB33_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB33_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst @@ -1421,23 +1592,37 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB34_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 +; SI-NEXT: v_subrev_i32_e32 v0, vcc, s34, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB34_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1447,20 +1632,44 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_sub v[0:1], v2 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB34_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_subrev_u32_e32 v2, vcc, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB34_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_sub v0, v1, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v1 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB34_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -1471,23 +1680,37 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB35_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16 +; SI-NEXT: v_subrev_i32_e32 v0, vcc, s34, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB35_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1499,20 +1722,44 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_sub v[0:1], v2 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB35_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_subrev_u32_e32 v2, vcc, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB35_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_sub v0, v1, s[4:5] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_subrev_u32_e32 v0, s6, v1 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB35_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst @@ -1524,23 +1771,37 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_scalar(ptr addrspace(1) inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB36_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_subrev_i32_e32 v1, vcc, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB36_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1550,20 +1811,46 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB36_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_subrev_u32_e32 v3, vcc, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB36_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_sub v0, v0, v1, s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB36_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -1574,23 +1861,37 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB37_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_subrev_i32_e32 v1, vcc, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB37_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1600,22 +1901,46 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; VI-NEXT: v_mov_b32_e32 v1, s34 +; VI-NEXT: v_mov_b32_e32 v2, s35 +; VI-NEXT: flat_load_dword v0, v[1:2] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB37_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_subrev_u32_e32 v3, vcc, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB37_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_sub v0, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB37_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst @@ -1667,9 +1992,25 @@ define void @global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory(ptr add ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB39_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_sub_i32_e32 v3, vcc, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB39_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1678,17 +2019,43 @@ define void @global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory(ptr add ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_sub v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB39_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_u32_e32 v3, vcc, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB39_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB39_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -1703,29 +2070,72 @@ define i32 @global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB40_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_sub_i32_e32 v4, vcc, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB40_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB40_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_sub_u32_e32 v0, vcc, v1, v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB40_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i32_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_sub_u32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB40_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %result = atomicrmw sub ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -1744,26 +2154,68 @@ define void @global_atomic_and_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB41_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB41_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_and_i32_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB41_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v3, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB41_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v[0:1], v2, off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB41_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -1777,9 +2229,25 @@ define void @global_atomic_and_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB42_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB42_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1788,17 +2256,43 @@ define void @global_atomic_and_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_and v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB42_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v3, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB42_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB42_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst @@ -1813,27 +2307,71 @@ define i32 @global_atomic_and_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB43_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB43_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_and_i32_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB43_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: v_and_b32_e32 v3, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB43_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v0, v[0:1], v2, off glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB43_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -1847,29 +2385,72 @@ define i32 @global_atomic_and_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB44_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB44_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_and_i32_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB44_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_and_b32_e32 v0, v1, v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB44_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB44_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst @@ -1881,23 +2462,37 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB45_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 +; SI-NEXT: v_and_b32_e32 v0, s34, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB45_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1907,20 +2502,44 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_and v[0:1], v2 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB45_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB45_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_and v0, v1, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, s6, v1 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB45_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -1931,23 +2550,37 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB46_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16 +; SI-NEXT: v_and_b32_e32 v0, s34, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB46_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1959,20 +2592,44 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_and v[0:1], v2 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB46_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB46_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_and v0, v1, s[4:5] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, s6, v1 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB46_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst @@ -1984,23 +2641,37 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_scalar(ptr addrspace(1) inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB47_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB47_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2010,20 +2681,46 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB47_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_and_b32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB47_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_and v0, v0, v1, s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_and_b32_e32 v2, s6, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB47_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -2034,23 +2731,37 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB48_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v1, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB48_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2060,22 +2771,46 @@ define amdgpu_gfx i32 @global_atomic_and_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; VI-NEXT: v_mov_b32_e32 v1, s34 +; VI-NEXT: v_mov_b32_e32 v2, s35 +; VI-NEXT: flat_load_dword v0, v[1:2] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB48_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_and_b32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB48_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_and v0, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_and_b32_e32 v2, s6, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB48_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst @@ -2090,9 +2825,25 @@ define void @global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory(ptr add ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB49_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB49_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2101,17 +2852,43 @@ define void @global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory(ptr add ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_and v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB49_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v3, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB49_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB49_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -2126,29 +2903,72 @@ define i32 @global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB50_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB50_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB50_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_and_b32_e32 v0, v1, v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB50_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i32_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_and_b32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB50_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %result = atomicrmw and ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -3030,26 +3850,68 @@ define void @global_atomic_or_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB61_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB61_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_or_i32_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB61_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v3, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB61_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v[0:1], v2, off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB61_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -3063,9 +3925,25 @@ define void @global_atomic_or_i32_noret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB62_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB62_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3074,17 +3952,43 @@ define void @global_atomic_or_i32_noret_offset(ptr addrspace(1) %out, i32 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_or v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB62_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v3, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB62_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB62_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst @@ -3099,27 +4003,71 @@ define i32 @global_atomic_or_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB63_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB63_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_or_i32_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB63_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: v_or_b32_e32 v3, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB63_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v0, v[0:1], v2, off glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_or_b32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB63_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -3133,29 +4081,72 @@ define i32 @global_atomic_or_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB64_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB64_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_or_i32_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB64_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_or_b32_e32 v0, v1, v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB64_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_or_b32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB64_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst @@ -3167,23 +4158,37 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_scalar(ptr addrspace(1) inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB65_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 +; SI-NEXT: v_or_b32_e32 v0, s34, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB65_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3193,20 +4198,44 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_scalar(ptr addrspace(1) inreg ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_or v[0:1], v2 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB65_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB65_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_or v0, v1, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v0, s6, v1 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB65_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -3217,23 +4246,37 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_offset_scalar(ptr addrspace(1 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB66_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16 +; SI-NEXT: v_or_b32_e32 v0, s34, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB66_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3245,20 +4288,44 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_offset_scalar(ptr addrspace(1 ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_or v[0:1], v2 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB66_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB66_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_or v0, v1, s[4:5] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v0, s6, v1 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB66_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst @@ -3270,23 +4337,37 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_scalar(ptr addrspace(1) inreg %p ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB67_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v1, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB67_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3296,20 +4377,46 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_scalar(ptr addrspace(1) inreg %p ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB67_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_or_b32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB67_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_or v0, v0, v1, s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_or_b32_e32 v2, s6, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB67_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -3320,23 +4427,37 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_offset_scalar(ptr addrspace(1) i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB68_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v1, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB68_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3346,22 +4467,46 @@ define amdgpu_gfx i32 @global_atomic_or_i32_ret_offset_scalar(ptr addrspace(1) i ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; VI-NEXT: v_mov_b32_e32 v1, s34 +; VI-NEXT: v_mov_b32_e32 v2, s35 +; VI-NEXT: flat_load_dword v0, v[1:2] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB68_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_or_b32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB68_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_or v0, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_or_b32_e32 v2, s6, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB68_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst @@ -3413,9 +4558,25 @@ define void @global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory(ptr addr ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB70_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB70_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3424,17 +4585,43 @@ define void @global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory(ptr addr ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_or v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB70_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v3, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB70_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB70_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -3449,29 +4636,72 @@ define i32 @global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory(ptr addrspa ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB71_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB71_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB71_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_or_b32_e32 v0, v1, v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB71_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i32_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_or_b32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB71_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %result = atomicrmw or ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -3490,26 +4720,68 @@ define void @global_atomic_xor_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB72_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB72_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xor_i32_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB72_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v3, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB72_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v[0:1], v2, off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB72_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -3523,9 +4795,25 @@ define void @global_atomic_xor_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB73_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB73_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3534,17 +4822,43 @@ define void @global_atomic_xor_i32_noret_offset(ptr addrspace(1) %out, i32 %in) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_xor v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB73_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v3, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB73_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB73_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst @@ -3559,27 +4873,71 @@ define i32 @global_atomic_xor_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB74_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_xor_b32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB74_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xor_i32_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB74_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: v_xor_b32_e32 v3, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB74_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v0, v[0:1], v2, off glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_xor_b32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB74_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -3593,29 +4951,72 @@ define i32 @global_atomic_xor_i32_ret_offset(ptr addrspace(1) %out, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB75_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_xor_b32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB75_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xor_i32_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB75_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_xor_b32_e32 v0, v1, v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB75_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_xor_b32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB75_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst @@ -3627,23 +5028,37 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB76_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 +; SI-NEXT: v_xor_b32_e32 v0, s34, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB76_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3653,20 +5068,44 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_scalar(ptr addrspace(1) inre ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_xor v[0:1], v2 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB76_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB76_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_xor v0, v1, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v1 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB76_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -3677,23 +5116,37 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB77_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16 +; SI-NEXT: v_xor_b32_e32 v0, s34, v1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB77_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3705,20 +5158,44 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_xor v[0:1], v2 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB77_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v2, s6, v3 +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB77_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_xor v0, v1, s[4:5] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v1 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB77_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst @@ -3730,23 +5207,37 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_scalar(ptr addrspace(1) inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB78_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_xor_b32_e32 v1, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB78_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3756,20 +5247,46 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_scalar(ptr addrspace(1) inreg % ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB78_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_xor_b32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB78_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_xor v0, v0, v1, s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_xor_b32_e32 v2, s6, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB78_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -3780,23 +5297,37 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB79_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_xor_b32_e32 v1, s34, v2 +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB79_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3806,22 +5337,46 @@ define amdgpu_gfx i32 @global_atomic_xor_i32_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; VI-NEXT: v_mov_b32_e32 v1, s34 +; VI-NEXT: v_mov_b32_e32 v2, s35 +; VI-NEXT: flat_load_dword v0, v[1:2] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB79_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_xor_b32_e32 v3, s6, v4 +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB79_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_xor v0, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_xor_b32_e32 v2, s6, v3 +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB79_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst @@ -3873,9 +5428,25 @@ define void @global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory(ptr add ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB81_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v3, v4, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB81_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3884,17 +5455,43 @@ define void @global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory(ptr add ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_xor v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB81_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v3, v4, v2 +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB81_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB81_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -3909,29 +5506,72 @@ define i32 @global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB82_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_xor_b32_e32 v4, v5, v2 +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB82_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB82_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_xor_b32_e32 v0, v1, v2 +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB82_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i32_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_xor_b32_e32 v3, v4, v2 +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB82_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %result = atomicrmw xor ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -8434,26 +10074,74 @@ define void @global_atomic_uinc_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB134_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v4 +; SI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; SI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB134_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_uinc_wrap_i32_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB134_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v4 +; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; VI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB134_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v[0:1], v2, off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB134_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, 1, v4 +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB134_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -8467,9 +10155,27 @@ define void @global_atomic_uinc_wrap_i32_noret_offset(ptr addrspace(1) %out, i32 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB135_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v4 +; SI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; SI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB135_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8478,17 +10184,47 @@ define void @global_atomic_uinc_wrap_i32_noret_offset(ptr addrspace(1) %out, i32 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB135_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v4 +; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; VI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB135_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB135_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, 1, v4 +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB135_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst @@ -8503,27 +10239,77 @@ define i32 @global_atomic_uinc_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB136_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v5 +; SI-NEXT: v_cmp_lt_u32_e32 vcc, v5, v2 +; SI-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB136_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_uinc_wrap_i32_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB136_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v4 +; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; VI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB136_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB136_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 1, v4 +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB136_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -8537,29 +10323,78 @@ define i32 @global_atomic_uinc_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %i ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB137_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v5 +; SI-NEXT: v_cmp_lt_u32_e32 vcc, v5, v2 +; SI-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB137_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_uinc_wrap_i32_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB137_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB137_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB137_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 1, v4 +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB137_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst @@ -8571,23 +10406,39 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_scalar(ptr addrspace(1 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB138_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; SI-NEXT: v_cmp_gt_u32_e32 vcc, s34, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB138_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8597,20 +10448,48 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_scalar(ptr addrspace(1 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB138_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v3 +; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3 +; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB138_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_inc v0, v1, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB138_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, 1, v1 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB138_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -8621,23 +10500,39 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_offset_scalar(ptr addr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v4, s6, 0 +; SI-NEXT: v_writelane_b32 v4, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB139_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16 +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v1 +; SI-NEXT: v_cmp_gt_u32_e32 vcc, s34, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, v1 +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB139_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v4, 1 +; SI-NEXT: v_readlane_b32 s6, v4, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8649,20 +10544,48 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_offset_scalar(ptr addr ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB139_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v2, vcc, 1, v3 +; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3 +; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB139_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_inc v0, v1, s[4:5] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB139_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v0, 1, v1 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, s6, v1 +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB139_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst @@ -8674,23 +10597,39 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_scalar(ptr addrspace(1) i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB140_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; SI-NEXT: v_cmp_gt_u32_e32 vcc, s34, v2 +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB140_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8700,20 +10639,50 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_scalar(ptr addrspace(1) i ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB140_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v4 +; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4 +; VI-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB140_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_inc v0, v0, v1, s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB140_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 1, v3 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB140_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -8724,23 +10693,39 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_offset_scalar(ptr addrspa ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v3, s6, 0 +; SI-NEXT: v_writelane_b32 v3, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB141_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: v_mov_b32_e32 v2, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; SI-NEXT: v_cmp_gt_u32_e32 vcc, s34, v2 +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc +; SI-NEXT: v_mov_b32_e32 v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, v2 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2 +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB141_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v3, 1 +; SI-NEXT: v_readlane_b32 s6, v3, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8750,22 +10735,50 @@ define amdgpu_gfx i32 @global_atomic_uinc_wrap_i32_ret_offset_scalar(ptr addrspa ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; VI-NEXT: v_mov_b32_e32 v1, s34 +; VI-NEXT: v_mov_b32_e32 v2, s35 +; VI-NEXT: flat_load_dword v0, v[1:2] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB141_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v4 +; VI-NEXT: v_cmp_gt_u32_e32 vcc, s6, v4 +; VI-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB141_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_inc v0, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB141_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 1, v3 +; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, s6, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB141_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst @@ -8780,9 +10793,27 @@ define void @global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory(p ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB142_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v4 +; SI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; SI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB142_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8791,17 +10822,47 @@ define void @global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory(p ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_inc v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB142_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v4 +; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; VI-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB142_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB142_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v3, 1, v4 +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB142_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -8816,29 +10877,78 @@ define i32 @global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory(ptr ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB143_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, 1, v5 +; SI-NEXT: v_cmp_lt_u32_e32 vcc, v5, v2 +; SI-NEXT: v_cndmask_b32_e32 v4, 0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB143_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB143_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v1 +; VI-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB143_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i32_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB143_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_add_u32_e32 v3, 1, v4 +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v4, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB143_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -8853,30 +10963,84 @@ define void @global_atomic_udec_wrap_i32_noret(ptr addrspace(1) %ptr, i32 %in) { ; SI-LABEL: global_atomic_udec_wrap_i32_noret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB144_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB144_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_udec_wrap_i32_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: .LBB144_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: s_cbranch_execnz .LBB144_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v[0:1], v2, off +; GFX9-NEXT: global_load_dword v4, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: .LBB144_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GFX9-NEXT: v_add_u32_e32 v3, -1, v4 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB144_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -8886,13 +11050,33 @@ define void @global_atomic_udec_wrap_i32_noret_offset(ptr addrspace(1) %out, i32 ; SI-LABEL: global_atomic_udec_wrap_i32_noret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB145_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[8:11], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB145_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -8901,17 +11085,51 @@ define void @global_atomic_udec_wrap_i32_noret_offset(ptr addrspace(1) %out, i32 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: .LBB145_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: s_cbranch_execnz .LBB145_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: .LBB145_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GFX9-NEXT: v_add_u32_e32 v3, -1, v4 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB145_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst @@ -8922,31 +11140,87 @@ define i32 @global_atomic_udec_wrap_i32_ret(ptr addrspace(1) %ptr, i32 %in) { ; SI-LABEL: global_atomic_udec_wrap_i32_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB146_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v5 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v5, v2 +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB146_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_udec_wrap_i32_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: .LBB146_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: s_cbranch_execnz .LBB146_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[6:7] +; VI-NEXT: v_mov_b32_e32 v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v0, v[0:1], v2, off glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: .LBB146_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GFX9-NEXT: v_add_u32_e32 v3, -1, v4 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB146_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -8956,33 +11230,88 @@ define i32 @global_atomic_udec_wrap_i32_ret_offset(ptr addrspace(1) %out, i32 %i ; SI-LABEL: global_atomic_udec_wrap_i32_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB147_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v5 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v5, v2 +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[8:11], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB147_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: .LBB147_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2 +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: s_cbranch_execnz .LBB147_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: .LBB147_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GFX9-NEXT: v_add_u32_e32 v3, -1, v4 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB147_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst @@ -8994,23 +11323,42 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v5, s6, 0 +; SI-NEXT: v_writelane_b32 v5, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: .LBB148_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v1 +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB148_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] +; SI-NEXT: v_readlane_b32 s7, v5, 1 +; SI-NEXT: v_readlane_b32 s6, v5, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -9020,20 +11368,54 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: .LBB148_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB148_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_dec v0, v1, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[36:37], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: .LBB148_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GFX9-NEXT: v_add_u32_e32 v0, -1, v1 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX9-NEXT: s_cbranch_execnz .LBB148_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst ret void @@ -9044,23 +11426,42 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v5, s6, 0 +; SI-NEXT: v_writelane_b32 v5, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: .LBB149_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16 +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v1 +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v1 +; SI-NEXT: v_mov_b32_e32 v3, v0 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB149_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] +; SI-NEXT: v_readlane_b32 s7, v5, 1 +; SI-NEXT: v_readlane_b32 s6, v5, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -9072,20 +11473,54 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addr ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v0, s34 ; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: flat_load_dword v3, v[0:1] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v4, s6 +; VI-NEXT: .LBB149_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v2, vcc, -1, v3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v3 +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: v_mov_b32_e32 v3, v2 +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB149_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_dec v0, v1, s[4:5] offset:16 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[36:37], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: .LBB149_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v1 +; GFX9-NEXT: v_add_u32_e32 v0, -1, v1 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX9-NEXT: s_cbranch_execnz .LBB149_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst @@ -9097,23 +11532,42 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v5, s6, 0 +; SI-NEXT: v_writelane_b32 v5, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: .LBB150_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v4 +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB150_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] +; SI-NEXT: v_readlane_b32 s7, v5, 1 +; SI-NEXT: v_readlane_b32 s6, v5, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -9123,20 +11577,56 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_scalar(ptr addrspace(1) i ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: .LBB150_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB150_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_dec v0, v0, v1, s[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] +; GFX9-NEXT: s_mov_b64 s[36:37], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: .LBB150_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v4 +; GFX9-NEXT: v_add_u32_e32 v0, -1, v4 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[3:4], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX9-NEXT: s_cbranch_execnz .LBB150_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %in seq_cst ret i32 %result @@ -9147,23 +11637,42 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspa ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v1, s6, 0 -; SI-NEXT: v_writelane_b32 v1, s7, 1 +; SI-NEXT: v_writelane_b32 v5, s6, 0 +; SI-NEXT: v_writelane_b32 v5, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s34 +; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: .LBB151_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16 glc +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; SI-NEXT: v_cmp_lt_u32_e64 s[36:37], s34, v4 +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc +; SI-NEXT: v_mov_b32_e32 v0, v3 +; SI-NEXT: v_mov_b32_e32 v1, v4 +; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v1, 1 -; SI-NEXT: v_readlane_b32 s6, v1, 0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB151_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] +; SI-NEXT: v_readlane_b32 s7, v5, 1 +; SI-NEXT: v_readlane_b32 s6, v5, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -9173,22 +11682,56 @@ define amdgpu_gfx i32 @global_atomic_udec_wrap_i32_ret_offset_scalar(ptr addrspa ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 16 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v0, s34 -; VI-NEXT: v_mov_b32_e32 v1, s35 -; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; VI-NEXT: v_mov_b32_e32 v1, s34 +; VI-NEXT: v_mov_b32_e32 v2, s35 +; VI-NEXT: flat_load_dword v0, v[1:2] +; VI-NEXT: s_mov_b64 s[36:37], 0 +; VI-NEXT: v_mov_b32_e32 v3, s6 +; VI-NEXT: .LBB151_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v5 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; VI-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v5 +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_cndmask_b32_e32 v4, v0, v3, vcc +; VI-NEXT: flat_atomic_cmpswap v0, v[1:2], v[4:5] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5 +; VI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; VI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; VI-NEXT: s_cbranch_execnz .LBB151_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[36:37] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-NEXT: global_atomic_dec v0, v0, v1, s[4:5] offset:16 glc +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: global_load_dword v0, v1, s[4:5] offset:16 +; GFX9-NEXT: s_mov_b64 s[36:37], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: .LBB151_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cmp_lt_u32_e64 s[34:35], s6, v4 +; GFX9-NEXT: v_add_u32_e32 v0, -1, v4 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v2, vcc +; GFX9-NEXT: global_atomic_cmpswap v0, v1, v[3:4], s[4:5] offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX9-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[36:37] +; GFX9-NEXT: s_cbranch_execnz .LBB151_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[36:37] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst @@ -9199,13 +11742,33 @@ define void @global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory(p ; SI-LABEL: global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB152_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v4 +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[8:11], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4 +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB152_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -9214,17 +11777,51 @@ define void @global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory(p ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_dec v[0:1], v2 +; VI-NEXT: flat_load_dword v4, v[0:1] +; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: .LBB152_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; VI-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: v_mov_b32_e32 v4, v3 +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: s_cbranch_execnz .LBB152_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v[0:1], v2, off offset:16 +; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: .LBB152_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GFX9-NEXT: v_add_u32_e32 v3, -1, v4 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB152_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -9235,33 +11832,88 @@ define i32 @global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory(ptr ; SI-LABEL: global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16 glc +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:16 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB153_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v5 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; SI-NEXT: v_cmp_gt_u32_e64 s[4:5], v5, v2 +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v4, v3, v2, vcc +; SI-NEXT: v_mov_b32_e32 v3, v4 +; SI-NEXT: v_mov_b32_e32 v4, v5 +; SI-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[8:11], 0 addr64 offset:16 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB153_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] +; SI-NEXT: v_mov_b32_e32 v0, v3 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc +; VI-NEXT: v_add_u32_e32 v3, vcc, 16, v0 +; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dword v0, v[3:4] +; VI-NEXT: s_mov_b64 s[6:7], 0 +; VI-NEXT: .LBB153_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v1 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; VI-NEXT: v_cmp_gt_u32_e64 s[4:5], v1, v2 +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; VI-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; VI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; VI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; VI-NEXT: s_cbranch_execnz .LBB153_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[6:7] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i32_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec v0, v[0:1], v2, off offset:16 glc +; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16 +; GFX9-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-NEXT: .LBB153_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v4, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-NEXT: v_cmp_gt_u32_e64 s[4:5], v4, v2 +; GFX9-NEXT: v_add_u32_e32 v3, -1, v4 +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v2, vcc +; GFX9-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:16 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_cbranch_execnz .LBB153_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i32, ptr addrspace(1) %out, i64 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll index a7f16449f058..9a9df43695dd 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll @@ -1329,26 +1329,76 @@ define void @global_atomic_sub_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB30_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; SI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB30_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_sub_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB30_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; VI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB30_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB30_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -1362,9 +1412,29 @@ define void @global_atomic_sub_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB31_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; SI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB31_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1373,17 +1443,47 @@ define void @global_atomic_sub_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB31_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; VI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB31_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB31_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst @@ -1394,32 +1494,88 @@ define i64 @global_atomic_sub_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-LABEL: global_atomic_sub_i64_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB32_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_sub_i32_e32 v8, vcc, v10, v7 +; SI-NEXT: v_subb_u32_e32 v9, vcc, v11, v6, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB32_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_sub_i64_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB32_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; VI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB32_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v4 +; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB32_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB32_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -1429,34 +1585,88 @@ define i64 @global_atomic_sub_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: global_atomic_sub_i64_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB33_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_sub_i32_e32 v8, vcc, v10, v7 +; SI-NEXT: v_subb_u32_e32 v9, vcc, v11, v6, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB33_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_sub_i64_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB33_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_sub_u32_e32 v6, vcc, v8, v2 +; VI-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB33_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB33_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB33_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst @@ -1468,25 +1678,43 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: v_writelane_b32 v9, s6, 0 +; SI-NEXT: v_writelane_b32 v9, s7, 1 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: .LBB34_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_subrev_i32_e32 v0, vcc, s34, v2 +; SI-NEXT: v_subb_u32_e32 v1, vcc, v3, v4, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v6, v1 +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[2:3] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v5 +; SI-NEXT: v_mov_b32_e32 v3, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB34_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v9, 1 +; SI-NEXT: v_readlane_b32 s6, v9, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1494,24 +1722,54 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_scalar(ptr addrspace(1) inre ; VI-LABEL: global_atomic_sub_i64_noret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: .LBB34_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB34_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: .LBB34_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB34_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -1522,23 +1780,43 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_writelane_b32 v9, s6, 0 +; SI-NEXT: v_writelane_b32 v9, s7, 1 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: .LBB35_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: v_subrev_i32_e32 v0, vcc, s34, v2 +; SI-NEXT: v_subb_u32_e32 v1, vcc, v3, v4, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v8, v3 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v6, v1 +; SI-NEXT: v_mov_b32_e32 v5, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[2:3] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v5 +; SI-NEXT: v_mov_b32_e32 v3, v6 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB35_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v9, 1 +; SI-NEXT: v_readlane_b32 s6, v9, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1548,24 +1826,52 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: .LBB35_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2 +; VI-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB35_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: .LBB35_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v5, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB35_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst @@ -1577,25 +1883,43 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_scalar(ptr addrspace(1) inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: v_writelane_b32 v9, s6, 0 +; SI-NEXT: v_writelane_b32 v9, s7, 1 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: .LBB36_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v8, v1 +; SI-NEXT: v_mov_b32_e32 v7, v0 +; SI-NEXT: v_subrev_i32_e32 v5, vcc, s34, v7 +; SI-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v5 +; SI-NEXT: v_mov_b32_e32 v1, v6 +; SI-NEXT: v_mov_b32_e32 v2, v7 +; SI-NEXT: v_mov_b32_e32 v3, v8 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB36_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v9, 1 +; SI-NEXT: v_readlane_b32 s6, v9, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1603,24 +1927,54 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-LABEL: global_atomic_sub_i64_ret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v4, s7 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: .LBB36_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v8, v1 +; VI-NEXT: v_mov_b32_e32 v7, v0 +; VI-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 +; VI-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB36_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: .LBB36_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[4:7], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB36_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw sub ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -1631,23 +1985,43 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v9, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_writelane_b32 v9, s6, 0 +; SI-NEXT: v_writelane_b32 v9, s7, 1 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: .LBB37_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v8, v1 +; SI-NEXT: v_mov_b32_e32 v7, v0 +; SI-NEXT: v_subrev_i32_e32 v5, vcc, s34, v7 +; SI-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v5 +; SI-NEXT: v_mov_b32_e32 v1, v6 +; SI-NEXT: v_mov_b32_e32 v2, v7 +; SI-NEXT: v_mov_b32_e32 v3, v8 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB37_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v9, 1 +; SI-NEXT: v_readlane_b32 s6, v9, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v9, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1658,23 +2032,51 @@ define amdgpu_gfx i64 @global_atomic_sub_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: .LBB37_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v8, v1 +; VI-NEXT: v_mov_b32_e32 v7, v0 +; VI-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7 +; VI-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB37_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[4:7], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB37_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst @@ -1689,9 +2091,29 @@ define void @global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory(ptr add ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB38_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_sub_i32_e32 v4, vcc, v6, v2 +; SI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB38_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1700,17 +2122,47 @@ define void @global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory(ptr add ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB38_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_sub_u32_e32 v4, vcc, v6, v2 +; VI-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB38_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB38_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB38_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -1721,34 +2173,88 @@ define i64 @global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; SI-LABEL: global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_sub_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB39_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_sub_i32_e32 v8, vcc, v10, v7 +; SI-NEXT: v_subb_u32_e32 v9, vcc, v11, v6, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB39_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB39_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_sub_u32_e32 v6, vcc, v8, v2 +; VI-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB39_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB39_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB39_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw sub ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -1767,26 +2273,76 @@ define void @global_atomic_and_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB40_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, v7, v3 +; SI-NEXT: v_and_b32_e32 v4, v6, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB40_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_and_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB40_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v5, v7, v3 +; VI-NEXT: v_and_b32_e32 v4, v6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB40_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB40_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB40_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -1800,9 +2356,29 @@ define void @global_atomic_and_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB41_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, v7, v3 +; SI-NEXT: v_and_b32_e32 v4, v6, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB41_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -1811,17 +2387,47 @@ define void @global_atomic_and_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB41_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v5, v7, v3 +; VI-NEXT: v_and_b32_e32 v4, v6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB41_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB41_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB41_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst @@ -1832,32 +2438,88 @@ define i64 @global_atomic_and_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-LABEL: global_atomic_and_i64_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB42_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_and_b32_e32 v9, v11, v6 +; SI-NEXT: v_and_b32_e32 v8, v10, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB42_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_and_i64_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB42_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_and_b32_e32 v5, v7, v3 +; VI-NEXT: v_and_b32_e32 v4, v6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB42_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v4 +; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB42_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB42_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -1867,34 +2529,88 @@ define i64 @global_atomic_and_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: global_atomic_and_i64_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB43_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_and_b32_e32 v9, v11, v6 +; SI-NEXT: v_and_b32_e32 v8, v10, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB43_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_and_i64_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB43_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_and_b32_e32 v7, v9, v3 +; VI-NEXT: v_and_b32_e32 v6, v8, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB43_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB43_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB43_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst @@ -1906,25 +2622,42 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB44_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_and_b32_e32 v1, s34, v3 +; SI-NEXT: v_and_b32_e32 v0, s35, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB44_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1932,24 +2665,52 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_scalar(ptr addrspace(1) inre ; VI-LABEL: global_atomic_and_i64_noret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: .LBB44_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v1, s7, v3 +; VI-NEXT: v_and_b32_e32 v0, s6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB44_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB44_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB44_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -1960,23 +2721,42 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 +; SI-NEXT: s_mov_b32 s34, s7 +; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB45_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: v_and_b32_e32 v1, s34, v3 +; SI-NEXT: v_and_b32_e32 v0, s35, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB45_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1986,24 +2766,50 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB45_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v1, s7, v3 +; VI-NEXT: v_and_b32_e32 v0, s6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB45_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB45_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v0, s6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB45_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst @@ -2015,25 +2821,42 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_scalar(ptr addrspace(1) inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB46_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, s34, v5 +; SI-NEXT: v_and_b32_e32 v2, s35, v4 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB46_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2041,24 +2864,52 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-LABEL: global_atomic_and_i64_ret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: .LBB46_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v1 +; VI-NEXT: v_mov_b32_e32 v6, v0 +; VI-NEXT: v_and_b32_e32 v5, s7, v7 +; VI-NEXT: v_and_b32_e32 v4, s6, v6 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB46_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB46_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_and_b32_e32 v4, s7, v6 +; GFX9-NEXT: v_and_b32_e32 v3, s6, v5 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB46_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw and ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -2069,23 +2920,42 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 +; SI-NEXT: s_mov_b32 s34, s7 +; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB47_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_and_b32_e32 v3, s34, v5 +; SI-NEXT: v_and_b32_e32 v2, s35, v4 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB47_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2096,23 +2966,49 @@ define amdgpu_gfx i64 @global_atomic_and_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB47_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v1 +; VI-NEXT: v_mov_b32_e32 v6, v0 +; VI-NEXT: v_and_b32_e32 v5, s7, v7 +; VI-NEXT: v_and_b32_e32 v4, s6, v6 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB47_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB47_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_and_b32_e32 v4, s7, v6 +; GFX9-NEXT: v_and_b32_e32 v3, s6, v5 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB47_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst @@ -2127,9 +3023,29 @@ define void @global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory(ptr add ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB48_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v5, v7, v3 +; SI-NEXT: v_and_b32_e32 v4, v6, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB48_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -2138,17 +3054,47 @@ define void @global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory(ptr add ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB48_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_and_b32_e32 v5, v7, v3 +; VI-NEXT: v_and_b32_e32 v4, v6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB48_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB48_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB48_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -2159,34 +3105,88 @@ define i64 @global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; SI-LABEL: global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_and_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB49_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_and_b32_e32 v9, v11, v6 +; SI-NEXT: v_and_b32_e32 v8, v10, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB49_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB49_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_and_b32_e32 v7, v9, v3 +; VI-NEXT: v_and_b32_e32 v6, v8, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB49_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB49_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_and_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v4, v6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB49_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw and ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -3197,26 +4197,76 @@ define void @global_atomic_or_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB60_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v7, v3 +; SI-NEXT: v_or_b32_e32 v4, v6, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB60_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_or_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB60_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v5, v7, v3 +; VI-NEXT: v_or_b32_e32 v4, v6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB60_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB60_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB60_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -3230,9 +4280,29 @@ define void @global_atomic_or_i64_noret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB61_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v7, v3 +; SI-NEXT: v_or_b32_e32 v4, v6, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB61_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3241,17 +4311,47 @@ define void @global_atomic_or_i64_noret_offset(ptr addrspace(1) %out, i64 %in) { ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB61_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v5, v7, v3 +; VI-NEXT: v_or_b32_e32 v4, v6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB61_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB61_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB61_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst @@ -3262,32 +4362,88 @@ define i64 @global_atomic_or_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-LABEL: global_atomic_or_i64_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB62_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_or_b32_e32 v9, v11, v6 +; SI-NEXT: v_or_b32_e32 v8, v10, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB62_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_or_i64_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB62_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_or_b32_e32 v5, v7, v3 +; VI-NEXT: v_or_b32_e32 v4, v6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB62_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v4 +; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB62_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB62_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -3297,34 +4453,88 @@ define i64 @global_atomic_or_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: global_atomic_or_i64_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB63_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_or_b32_e32 v9, v11, v6 +; SI-NEXT: v_or_b32_e32 v8, v10, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB63_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_or_i64_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB63_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_or_b32_e32 v7, v9, v3 +; VI-NEXT: v_or_b32_e32 v6, v8, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB63_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB63_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB63_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst @@ -3336,25 +4546,42 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_scalar(ptr addrspace(1) inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB64_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_or_b32_e32 v1, s34, v3 +; SI-NEXT: v_or_b32_e32 v0, s35, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB64_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3362,24 +4589,52 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_scalar(ptr addrspace(1) inreg ; VI-LABEL: global_atomic_or_i64_noret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: .LBB64_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v1, s7, v3 +; VI-NEXT: v_or_b32_e32 v0, s6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB64_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB64_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB64_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -3390,23 +4645,42 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_offset_scalar(ptr addrspace(1 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 +; SI-NEXT: s_mov_b32 s34, s7 +; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB65_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: v_or_b32_e32 v1, s34, v3 +; SI-NEXT: v_or_b32_e32 v0, s35, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB65_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3416,24 +4690,50 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_offset_scalar(ptr addrspace(1 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB65_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v1, s7, v3 +; VI-NEXT: v_or_b32_e32 v0, s6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB65_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB65_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_or_b32_e32 v0, s6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB65_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst @@ -3445,25 +4745,42 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_scalar(ptr addrspace(1) inreg %p ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB66_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v3, s34, v5 +; SI-NEXT: v_or_b32_e32 v2, s35, v4 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB66_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3471,24 +4788,52 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_scalar(ptr addrspace(1) inreg %p ; VI-LABEL: global_atomic_or_i64_ret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: .LBB66_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v1 +; VI-NEXT: v_mov_b32_e32 v6, v0 +; VI-NEXT: v_or_b32_e32 v5, s7, v7 +; VI-NEXT: v_or_b32_e32 v4, s6, v6 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB66_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB66_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_or_b32_e32 v4, s7, v6 +; GFX9-NEXT: v_or_b32_e32 v3, s6, v5 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB66_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw or ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -3499,23 +4844,42 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_offset_scalar(ptr addrspace(1) i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 +; SI-NEXT: s_mov_b32 s34, s7 +; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB67_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_or_b32_e32 v3, s34, v5 +; SI-NEXT: v_or_b32_e32 v2, s35, v4 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB67_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3526,23 +4890,49 @@ define amdgpu_gfx i64 @global_atomic_or_i64_ret_offset_scalar(ptr addrspace(1) i ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB67_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v1 +; VI-NEXT: v_mov_b32_e32 v6, v0 +; VI-NEXT: v_or_b32_e32 v5, s7, v7 +; VI-NEXT: v_or_b32_e32 v4, s6, v6 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB67_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB67_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_or_b32_e32 v4, s7, v6 +; GFX9-NEXT: v_or_b32_e32 v3, s6, v5 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB67_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst @@ -3557,9 +4947,29 @@ define void @global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr addr ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB68_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_or_b32_e32 v5, v7, v3 +; SI-NEXT: v_or_b32_e32 v4, v6, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB68_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3568,17 +4978,47 @@ define void @global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr addr ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB68_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_or_b32_e32 v5, v7, v3 +; VI-NEXT: v_or_b32_e32 v4, v6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB68_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB68_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB68_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -3589,34 +5029,88 @@ define i64 @global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr addrspa ; SI-LABEL: global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_or_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB69_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_or_b32_e32 v9, v11, v6 +; SI-NEXT: v_or_b32_e32 v8, v10, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB69_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB69_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_or_b32_e32 v7, v9, v3 +; VI-NEXT: v_or_b32_e32 v6, v8, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB69_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_or_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB69_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_or_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_or_b32_e32 v4, v6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB69_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw or ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -3635,26 +5129,76 @@ define void @global_atomic_xor_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB70_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v5, v7, v3 +; SI-NEXT: v_xor_b32_e32 v4, v6, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB70_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xor_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB70_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v5, v7, v3 +; VI-NEXT: v_xor_b32_e32 v4, v6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB70_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB70_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB70_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -3668,9 +5212,29 @@ define void @global_atomic_xor_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB71_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v5, v7, v3 +; SI-NEXT: v_xor_b32_e32 v4, v6, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB71_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -3679,17 +5243,47 @@ define void @global_atomic_xor_i64_noret_offset(ptr addrspace(1) %out, i64 %in) ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB71_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v5, v7, v3 +; VI-NEXT: v_xor_b32_e32 v4, v6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB71_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB71_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB71_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst @@ -3700,32 +5294,88 @@ define i64 @global_atomic_xor_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-LABEL: global_atomic_xor_i64_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB72_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_xor_b32_e32 v9, v11, v6 +; SI-NEXT: v_xor_b32_e32 v8, v10, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB72_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xor_i64_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB72_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_xor_b32_e32 v5, v7, v3 +; VI-NEXT: v_xor_b32_e32 v4, v6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB72_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v4 +; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB72_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB72_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -3735,34 +5385,88 @@ define i64 @global_atomic_xor_i64_ret_offset(ptr addrspace(1) %out, i64 %in) { ; SI-LABEL: global_atomic_xor_i64_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB73_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_xor_b32_e32 v9, v11, v6 +; SI-NEXT: v_xor_b32_e32 v8, v10, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB73_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xor_i64_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB73_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_xor_b32_e32 v7, v9, v3 +; VI-NEXT: v_xor_b32_e32 v6, v8, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB73_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB73_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB73_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst @@ -3774,25 +5478,42 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB74_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_xor_b32_e32 v1, s34, v3 +; SI-NEXT: v_xor_b32_e32 v0, s35, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB74_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3800,24 +5521,52 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_scalar(ptr addrspace(1) inre ; VI-LABEL: global_atomic_xor_i64_noret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: .LBB74_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v1, s7, v3 +; VI-NEXT: v_xor_b32_e32 v0, s6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB74_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB74_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB74_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -3828,23 +5577,42 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 +; SI-NEXT: s_mov_b32 s34, s7 +; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB75_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: v_xor_b32_e32 v1, s34, v3 +; SI-NEXT: v_xor_b32_e32 v0, s35, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB75_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3854,24 +5622,50 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_offset_scalar(ptr addrspace( ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB75_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v1, s7, v3 +; VI-NEXT: v_xor_b32_e32 v0, s6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB75_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB75_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB75_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst @@ -3883,25 +5677,42 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_scalar(ptr addrspace(1) inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB76_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_xor_b32_e32 v3, s34, v5 +; SI-NEXT: v_xor_b32_e32 v2, s35, v4 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB76_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3909,24 +5720,52 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_scalar(ptr addrspace(1) inreg % ; VI-LABEL: global_atomic_xor_i64_ret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: .LBB76_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v1 +; VI-NEXT: v_mov_b32_e32 v6, v0 +; VI-NEXT: v_xor_b32_e32 v5, s7, v7 +; VI-NEXT: v_xor_b32_e32 v4, s6, v6 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB76_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB76_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_xor_b32_e32 v4, s7, v6 +; GFX9-NEXT: v_xor_b32_e32 v3, s6, v5 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB76_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw xor ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -3937,23 +5776,42 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 +; SI-NEXT: s_mov_b32 s34, s7 +; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB77_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_xor_b32_e32 v3, s34, v5 +; SI-NEXT: v_xor_b32_e32 v2, s35, v4 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB77_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3964,23 +5822,49 @@ define amdgpu_gfx i64 @global_atomic_xor_i64_ret_offset_scalar(ptr addrspace(1) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB77_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v1 +; VI-NEXT: v_mov_b32_e32 v6, v0 +; VI-NEXT: v_xor_b32_e32 v5, s7, v7 +; VI-NEXT: v_xor_b32_e32 v4, s6, v6 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB77_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB77_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_xor_b32_e32 v4, s7, v6 +; GFX9-NEXT: v_xor_b32_e32 v3, s6, v5 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB77_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst @@ -3995,9 +5879,29 @@ define void @global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr add ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB78_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_xor_b32_e32 v5, v7, v3 +; SI-NEXT: v_xor_b32_e32 v4, v6, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB78_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -4006,17 +5910,47 @@ define void @global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr add ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB78_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_xor_b32_e32 v5, v7, v3 +; VI-NEXT: v_xor_b32_e32 v4, v6, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB78_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB78_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB78_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -4027,34 +5961,88 @@ define i64 @global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr addrsp ; SI-LABEL: global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v6, v3 +; SI-NEXT: v_mov_b32_e32 v7, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_xor_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[4:5], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB79_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: v_xor_b32_e32 v9, v11, v6 +; SI-NEXT: v_xor_b32_e32 v8, v10, v7 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB79_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB79_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_xor_b32_e32 v7, v9, v3 +; VI-NEXT: v_xor_b32_e32 v6, v8, v2 +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB79_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB79_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2 +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB79_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw xor ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -9322,26 +11310,85 @@ define void @global_atomic_uinc_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB131_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB131_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_uinc_wrap_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB131_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB131_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB131_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB131_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -9355,9 +11402,32 @@ define void @global_atomic_uinc_wrap_i64_noret_offset(ptr addrspace(1) %out, i64 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB132_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB132_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -9366,17 +11436,53 @@ define void @global_atomic_uinc_wrap_i64_noret_offset(ptr addrspace(1) %out, i64 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB132_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB132_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB132_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB132_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -9387,32 +11493,97 @@ define i64 @global_atomic_uinc_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-LABEL: global_atomic_uinc_wrap_i64_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB133_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v10 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v11, vcc +; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB133_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_uinc_wrap_i64_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB133_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB133_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] +; VI-NEXT: v_mov_b32_e32 v0, v4 +; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB133_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB133_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -9422,34 +11593,97 @@ define i64 @global_atomic_uinc_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %i ; SI-LABEL: global_atomic_uinc_wrap_i64_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB134_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v10 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v11, vcc +; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB134_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_uinc_wrap_i64_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB134_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v8 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc +; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB134_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB134_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB134_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -9461,25 +11695,45 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_scalar(ptr addrspace(1 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB135_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[34:35], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB135_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -9487,24 +11741,58 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_scalar(ptr addrspace(1 ; VI-LABEL: global_atomic_uinc_wrap_i64_noret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: .LBB135_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB135_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB135_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB135_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -9515,23 +11803,45 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_offset_scalar(ptr addr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_writelane_b32 v8, s6, 0 +; SI-NEXT: v_writelane_b32 v8, s7, 1 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB136_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[34:35], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v7, v3 +; SI-NEXT: v_mov_b32_e32 v6, v2 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB136_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v8, 1 +; SI-NEXT: v_readlane_b32 s6, v8, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -9541,24 +11851,56 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_offset_scalar(ptr addr ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB136_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v2 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB136_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB136_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB136_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -9570,25 +11912,45 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_scalar(ptr addrspace(1) i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB137_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v4 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[34:35], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB137_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -9596,24 +11958,58 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_scalar(ptr addrspace(1) i ; VI-LABEL: global_atomic_uinc_wrap_i64_ret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_mov_b64 s[34:35], 0 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: .LBB137_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v1 +; VI-NEXT: v_mov_b32_e32 v6, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] +; VI-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB137_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB137_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[5:6] +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB137_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -9624,23 +12020,45 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspa ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[36:37], 0 +; SI-NEXT: .LBB138_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v4 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[34:35], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v3, 0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 +; SI-NEXT: v_mov_b32_e32 v3, v5 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] +; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] +; SI-NEXT: s_cbranch_execnz .LBB138_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[36:37] +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -9651,23 +12069,55 @@ define amdgpu_gfx i64 @global_atomic_uinc_wrap_i64_ret_offset_scalar(ptr addrspa ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_mov_b64 s[34:35], 0 +; VI-NEXT: .LBB138_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v1 +; VI-NEXT: v_mov_b32_e32 v6, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v6 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc +; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7] +; VI-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; VI-NEXT: s_andn2_b64 exec, exec, s[34:35] +; VI-NEXT: s_cbranch_execnz .LBB138_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[34:35] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: .LBB138_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[5:6] +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v0, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[3:6], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[5:6] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB138_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -9682,9 +12132,32 @@ define void @global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(p ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB139_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, 1, v6 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB139_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -9693,17 +12166,53 @@ define void @global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(p ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB139_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_add_u32_e32 v4, vcc, 1, v6 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB139_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB139_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB139_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -9714,34 +12223,97 @@ define i64 @global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr ; SI-LABEL: global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s4, s6 ; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_inc_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB140_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, 1, v10 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v11, vcc +; SI-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; SI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; SI-NEXT: s_cbranch_execnz .LBB140_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[8:9] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[4:5], 0 +; VI-NEXT: .LBB140_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v8 +; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc +; VI-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; VI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; VI-NEXT: s_cbranch_execnz .LBB140_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[4:5] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB140_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB140_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -9756,30 +12328,95 @@ define void @global_atomic_udec_wrap_i64_noret(ptr addrspace(1) %ptr, i64 %in) { ; SI-LABEL: global_atomic_udec_wrap_i64_noret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB141_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, -1, v6 +; SI-NEXT: v_addc_u32_e32 v5, vcc, -1, v7, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; SI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB141_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_udec_wrap_i64_noret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[8:9], 0 +; VI-NEXT: .LBB141_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; VI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; VI-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; VI-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: s_cbranch_execnz .LBB141_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[2:3], off +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB141_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB141_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -9789,13 +12426,38 @@ define void @global_atomic_udec_wrap_i64_noret_offset(ptr addrspace(1) %out, i64 ; SI-LABEL: global_atomic_udec_wrap_i64_noret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[8:11], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB142_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, -1, v6 +; SI-NEXT: v_addc_u32_e32 v5, vcc, -1, v7, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; SI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB142_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -9804,17 +12466,57 @@ define void @global_atomic_udec_wrap_i64_noret_offset(ptr addrspace(1) %out, i64 ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[8:9], 0 +; VI-NEXT: .LBB142_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; VI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; VI-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; VI-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: s_cbranch_execnz .LBB142_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB142_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB142_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -9825,32 +12527,103 @@ define i64 @global_atomic_udec_wrap_i64_ret(ptr addrspace(1) %ptr, i64 %in) { ; SI-LABEL: global_atomic_udec_wrap_i64_ret: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 glc +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB143_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v10 +; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v11, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; SI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[4:5] +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, v1, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB143_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_udec_wrap_i64_ret: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; VI-NEXT: s_mov_b64 s[8:9], 0 +; VI-NEXT: .LBB143_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; VI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; VI-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; VI-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: s_cbranch_execnz .LBB143_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[8:9] +; VI-NEXT: v_mov_b32_e32 v0, v4 +; VI-NEXT: v_mov_b32_e32 v1, v5 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB143_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB143_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -9860,34 +12633,103 @@ define i64 @global_atomic_udec_wrap_i64_ret_offset(ptr addrspace(1) %out, i64 %i ; SI-LABEL: global_atomic_udec_wrap_i64_ret_offset: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[8:11], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB144_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v10 +; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v11, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; SI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[4:5] +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, v1, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[8:11], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB144_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_udec_wrap_i64_ret_offset: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[8:9], 0 +; VI-NEXT: .LBB144_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; VI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; VI-NEXT: v_add_u32_e64 v0, s[6:7], -1, v8 +; VI-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: s_cbranch_execnz .LBB144_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB144_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB144_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -9899,25 +12741,49 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: .LBB145_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v2 +; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[2:3] +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: v_mov_b32_e32 v2, v6 +; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB145_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -9925,24 +12791,66 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1 ; VI-LABEL: global_atomic_udec_wrap_i64_noret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s4 +; VI-NEXT: s_mov_b64 s[38:39], 0 +; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: .LBB145_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; VI-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2 +; VI-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; VI-NEXT: s_cbranch_execnz .LBB145_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[38:39] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: .LBB145_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB145_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret void @@ -9953,23 +12861,49 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: .LBB146_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v2 +; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v3, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[2:3] +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v9, v3 +; SI-NEXT: v_mov_b32_e32 v8, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: v_mov_b32_e32 v2, v6 +; SI-NEXT: v_mov_b32_e32 v3, v7 +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB146_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -9979,24 +12913,64 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addr ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 -; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 -; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; VI-NEXT: v_mov_b32_e32 v4, s34 +; VI-NEXT: v_mov_b32_e32 v5, s35 +; VI-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; VI-NEXT: s_mov_b64 s[38:39], 0 +; VI-NEXT: v_mov_b32_e32 v6, s7 +; VI-NEXT: v_mov_b32_e32 v7, s6 +; VI-NEXT: .LBB146_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; VI-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2 +; VI-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, v1 +; VI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; VI-NEXT: v_mov_b32_e32 v2, v0 +; VI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; VI-NEXT: s_cbranch_execnz .LBB146_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[38:39] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[4:5] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: .LBB146_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB146_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -10008,25 +12982,49 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_scalar(ptr addrspace(1) i ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: s_mov_b32 s34, s7 -; SI-NEXT: s_mov_b32 s35, s6 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: .LBB147_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v8 +; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v9, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[8:9] +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB147_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -10034,24 +13032,66 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_scalar(ptr addrspace(1) i ; VI-LABEL: global_atomic_udec_wrap_i64_ret_scalar: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_mov_b64 s[38:39], 0 +; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_mov_b32_e32 v5, s6 ; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: .LBB147_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; VI-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8 +; VI-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; VI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; VI-NEXT: s_cbranch_execnz .LBB147_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[38:39] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[4:5] glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: .LBB147_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[7:8] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[7:8] +; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v7 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v8, s[36:37] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v1, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v4, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB147_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %in seq_cst ret i64 %result @@ -10062,23 +13102,49 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_offset_scalar(ptr addrspa ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 +; SI-NEXT: s_mov_b32 s35, s7 +; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 +; SI-NEXT: s_mov_b64 s[38:39], 0 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 +; SI-NEXT: .LBB148_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v8 +; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v9, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; SI-NEXT: v_cmp_lt_u64_e64 s[36:37], s[34:35], v[8:9] +; SI-NEXT: s_or_b64 vcc, vcc, s[36:37] +; SI-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; SI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; SI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; SI-NEXT: s_cbranch_execnz .LBB148_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[38:39] +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -10089,23 +13155,63 @@ define amdgpu_gfx i64 @global_atomic_udec_wrap_i64_ret_offset_scalar(ptr addrspa ; VI-NEXT: s_add_u32 s34, s4, 32 ; VI-NEXT: s_addc_u32 s35, s5, 0 ; VI-NEXT: v_mov_b32_e32 v2, s34 -; VI-NEXT: v_mov_b32_e32 v0, s6 -; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: v_mov_b32_e32 v3, s35 -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[2:3] +; VI-NEXT: s_mov_b64 s[38:39], 0 +; VI-NEXT: v_mov_b32_e32 v4, s7 +; VI-NEXT: v_mov_b32_e32 v5, s6 +; VI-NEXT: .LBB148_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; VI-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9] +; VI-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8 +; VI-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37] +; VI-NEXT: s_or_b64 vcc, vcc, s[34:35] +; VI-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; VI-NEXT: s_andn2_b64 exec, exec, s[38:39] +; VI-NEXT: s_cbranch_execnz .LBB148_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[38:39] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset_scalar: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[38:39], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 +; GFX9-NEXT: v_mov_b32_e32 v4, s6 +; GFX9-NEXT: .LBB148_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, v0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[7:8] +; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[7:8] +; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v7 +; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v8, s[36:37] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e32 v6, v1, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v0, v4, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v2, v[5:8], s[4:5] offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8] +; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39] +; GFX9-NEXT: s_cbranch_execnz .LBB148_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[38:39] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -10116,13 +13222,38 @@ define void @global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(p ; SI-LABEL: global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[8:11], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB149_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_add_i32_e32 v4, vcc, -1, v6 +; SI-NEXT: v_addc_u32_e32 v5, vcc, -1, v7, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; SI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v7 +; SI-NEXT: v_mov_b32_e32 v10, v6 +; SI-NEXT: v_mov_b32_e32 v9, v5 +; SI-NEXT: v_mov_b32_e32 v8, v4 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[8:11], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7] +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: v_mov_b32_e32 v6, v8 +; SI-NEXT: v_mov_b32_e32 v7, v9 +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB149_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; @@ -10131,17 +13262,57 @@ define void @global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(p ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; VI-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; VI-NEXT: s_mov_b64 s[8:9], 0 +; VI-NEXT: .LBB149_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; VI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; VI-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6 +; VI-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; VI-NEXT: v_mov_b32_e32 v7, v5 +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: v_mov_b32_e32 v6, v4 +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: s_cbranch_execnz .LBB149_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[2:3], off offset:32 +; GFX9-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB149_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB149_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 @@ -10152,34 +13323,103 @@ define i64 @global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr ; SI-LABEL: global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s6 -; SI-NEXT: buffer_atomic_dec_x2 v[2:3], v[0:1], s[4:7], 0 addr64 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v5, v3 +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v7, v1 +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s10 +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[8:11], 0 addr64 offset:32 +; SI-NEXT: s_mov_b64 s[6:7], 0 +; SI-NEXT: .LBB150_1: ; %atomicrmw.start +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, v1 +; SI-NEXT: v_mov_b32_e32 v10, v0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_add_i32_e32 v0, vcc, -1, v10 +; SI-NEXT: v_addc_u32_e32 v1, vcc, -1, v11, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; SI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[4:5] +; SI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; SI-NEXT: v_cndmask_b32_e32 v9, v1, v5, vcc +; SI-NEXT: v_cndmask_b32_e32 v8, v0, v4, vcc +; SI-NEXT: v_mov_b32_e32 v0, v8 +; SI-NEXT: v_mov_b32_e32 v1, v9 +; SI-NEXT: v_mov_b32_e32 v2, v10 +; SI-NEXT: v_mov_b32_e32 v3, v11 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[8:11], 0 addr64 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_mov_b32_e32 v0, v2 -; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11] +; SI-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SI-NEXT: s_andn2_b64 exec, exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB150_1 +; SI-NEXT: ; %bb.2: ; %atomicrmw.end +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; VI-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; VI-NEXT: flat_load_dwordx2 v[0:1], v[4:5] +; VI-NEXT: s_mov_b64 s[8:9], 0 +; VI-NEXT: .LBB150_1: ; %atomicrmw.start +; VI-NEXT: ; =>This Inner Loop Header: Depth=1 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_mov_b32_e32 v9, v1 +; VI-NEXT: v_mov_b32_e32 v8, v0 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; VI-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3] +; VI-NEXT: v_add_u32_e64 v0, s[6:7], -1, v8 +; VI-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7] +; VI-NEXT: s_or_b64 vcc, vcc, s[4:5] +; VI-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc +; VI-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; VI-NEXT: s_andn2_b64 exec, exec, s[8:9] +; VI-NEXT: s_cbranch_execnz .LBB150_1 +; VI-NEXT: ; %bb.2: ; %atomicrmw.end +; VI-NEXT: s_or_b64 exec, exec, s[8:9] ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: global_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[0:1], v[2:3], off offset:32 glc +; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:32 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: .LBB150_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7] +; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:32 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_cbranch_execnz .LBB150_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %result = atomicrmw udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0 diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll index 19cb7d21471d..d821caa5b1c6 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll @@ -151,8 +151,17 @@ define i16 @test_atomicrmw_and_i16_global_system(ptr addrspace(1) %ptr, i16 %val ; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] ; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[ANDOPERAND]] +; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; CHECK-NEXT: ret i16 [[EXTRACTED]] ; @@ -204,8 +213,17 @@ define i16 @test_atomicrmw_or_i16_global_system(ptr addrspace(1) %ptr, i16 %valu ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw or ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = or i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; CHECK-NEXT: ret i16 [[EXTRACTED]] ; @@ -224,8 +242,17 @@ define i16 @test_atomicrmw_xor_i16_global_system(ptr addrspace(1) %ptr, i16 %val ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = zext i16 [[VALUE:%.*]] to i32 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw xor ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = xor i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16 ; CHECK-NEXT: ret i16 [[EXTRACTED]] ; diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll index 08288848efd6..9a03d2164e1d 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll @@ -112,7 +112,16 @@ define i32 @test_atomicrmw_add_i32_global_system__amdgpu_no_fine_grained_memory_ define i32 @test_atomicrmw_sub_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_sub_i32_global_system( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4 +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = sub i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw sub ptr addrspace(1) %ptr, i32 %value seq_cst @@ -122,7 +131,16 @@ define i32 @test_atomicrmw_sub_i32_global_system(ptr addrspace(1) %ptr, i32 %val define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_fine_grained_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = sub i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw sub ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -132,7 +150,16 @@ define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_fine_grained_memory( define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = sub i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw sub ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory !0 @@ -142,7 +169,16 @@ define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_remote_memory(ptr ad define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = sub i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw sub ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -157,7 +193,16 @@ define i32 @test_atomicrmw_sub_i32_global_system__amdgpu_no_fine_grained_memory_ define i32 @test_atomicrmw_and_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_and_i32_global_system( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4 +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i32 %value seq_cst @@ -167,7 +212,16 @@ define i32 @test_atomicrmw_and_i32_global_system(ptr addrspace(1) %ptr, i32 %val define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_fine_grained_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -177,7 +231,16 @@ define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_fine_grained_memory( define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory !0 @@ -187,7 +250,16 @@ define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_remote_memory(ptr ad define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_and_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -287,7 +359,16 @@ define i32 @test_atomicrmw_nand_i32_global_system__amdgpu_no_fine_grained_memory define i32 @test_atomicrmw_or_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_or_i32_global_system( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4 +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = or i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw or ptr addrspace(1) %ptr, i32 %value seq_cst @@ -297,7 +378,16 @@ define i32 @test_atomicrmw_or_i32_global_system(ptr addrspace(1) %ptr, i32 %valu define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_fine_grained_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = or i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw or ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -307,7 +397,16 @@ define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_fine_grained_memory(p define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = or i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw or ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory !0 @@ -317,7 +416,16 @@ define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_remote_memory(ptr add define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = or i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw or ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -332,7 +440,16 @@ define i32 @test_atomicrmw_or_i32_global_system__amdgpu_no_fine_grained_memory__ define i32 @test_atomicrmw_xor_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_xor_i32_global_system( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4 +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = xor i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %value seq_cst @@ -342,7 +459,16 @@ define i32 @test_atomicrmw_xor_i32_global_system(ptr addrspace(1) %ptr, i32 %val define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_fine_grained_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = xor i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -352,7 +478,16 @@ define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_fine_grained_memory( define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = xor i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory !0 @@ -362,7 +497,16 @@ define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_remote_memory(ptr ad define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_xor_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = xor i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw xor ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -717,7 +861,18 @@ define i32 @test_atomicrmw_umin_i32_global_system__amdgpu_no_fine_grained_memory define i32 @test_atomicrmw_uinc_wrap_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_system( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4 +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = add i32 [[LOADED]], 1 +; COMMON-NEXT: [[TMP3:%.*]] = icmp uge i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i32 0, i32 [[TMP2]] +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %value seq_cst @@ -727,7 +882,18 @@ define i32 @test_atomicrmw_uinc_wrap_i32_global_system(ptr addrspace(1) %ptr, i3 define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_fine_grained_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = add i32 [[LOADED]], 1 +; COMMON-NEXT: [[TMP3:%.*]] = icmp uge i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i32 0, i32 [[TMP2]] +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -737,7 +903,18 @@ define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_fine_grained_m define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = add i32 [[LOADED]], 1 +; COMMON-NEXT: [[TMP3:%.*]] = icmp uge i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i32 0, i32 [[TMP2]] +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory !0 @@ -747,7 +924,18 @@ define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_remote_memory( define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = add i32 [[LOADED]], 1 +; COMMON-NEXT: [[TMP3:%.*]] = icmp uge i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i32 0, i32 [[TMP2]] +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -762,7 +950,20 @@ define i32 @test_atomicrmw_uinc_wrap_i32_global_system__amdgpu_no_fine_grained_m define i32 @test_atomicrmw_udec_wrap_i32_global_system(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_system( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4 +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = sub i32 [[LOADED]], 1 +; COMMON-NEXT: [[TMP3:%.*]] = icmp eq i32 [[LOADED]], 0 +; COMMON-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i32 [[VALUE]], i32 [[TMP2]] +; COMMON-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %value seq_cst @@ -772,7 +973,20 @@ define i32 @test_atomicrmw_udec_wrap_i32_global_system(ptr addrspace(1) %ptr, i3 define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_fine_grained_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = sub i32 [[LOADED]], 1 +; COMMON-NEXT: [[TMP3:%.*]] = icmp eq i32 [[LOADED]], 0 +; COMMON-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i32 [[VALUE]], i32 [[TMP2]] +; COMMON-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -782,7 +996,20 @@ define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_fine_grained_m define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = sub i32 [[LOADED]], 1 +; COMMON-NEXT: [[TMP3:%.*]] = icmp eq i32 [[LOADED]], 0 +; COMMON-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i32 [[VALUE]], i32 [[TMP2]] +; COMMON-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory !0 @@ -792,7 +1019,20 @@ define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_remote_memory( define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) { ; COMMON-LABEL: define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = sub i32 [[LOADED]], 1 +; COMMON-NEXT: [[TMP3:%.*]] = icmp eq i32 [[LOADED]], 0 +; COMMON-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i32 [[VALUE]], i32 [[TMP2]] +; COMMON-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i32 [[NEWLOADED]] ; %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll index 4f3979f25076..9e28f8f5ac3f 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll @@ -112,7 +112,16 @@ define i64 @test_atomicrmw_add_i64_global_system__amdgpu_no_fine_grained_memory_ define i64 @test_atomicrmw_sub_i64_global_system(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_sub_i64_global_system( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8 +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = sub i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw sub ptr addrspace(1) %ptr, i64 %value seq_cst @@ -122,7 +131,16 @@ define i64 @test_atomicrmw_sub_i64_global_system(ptr addrspace(1) %ptr, i64 %val define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_fine_grained_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = sub i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw sub ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -132,7 +150,16 @@ define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_fine_grained_memory( define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = sub i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw sub ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory !0 @@ -142,7 +169,16 @@ define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_remote_memory(ptr ad define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw sub ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = sub i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw sub ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -157,7 +193,16 @@ define i64 @test_atomicrmw_sub_i64_global_system__amdgpu_no_fine_grained_memory_ define i64 @test_atomicrmw_and_i64_global_system(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_and_i64_global_system( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8 +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = and i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i64 %value seq_cst @@ -167,7 +212,16 @@ define i64 @test_atomicrmw_and_i64_global_system(ptr addrspace(1) %ptr, i64 %val define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_fine_grained_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = and i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -177,7 +231,16 @@ define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_fine_grained_memory( define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = and i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory !0 @@ -187,7 +250,16 @@ define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_remote_memory(ptr ad define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_and_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw and ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = and i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw and ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -287,7 +359,16 @@ define i64 @test_atomicrmw_nand_i64_global_system__amdgpu_no_fine_grained_memory define i64 @test_atomicrmw_or_i64_global_system(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_or_i64_global_system( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8 +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = or i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw or ptr addrspace(1) %ptr, i64 %value seq_cst @@ -297,7 +378,16 @@ define i64 @test_atomicrmw_or_i64_global_system(ptr addrspace(1) %ptr, i64 %valu define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_fine_grained_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = or i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw or ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -307,7 +397,16 @@ define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_fine_grained_memory(p define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = or i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw or ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory !0 @@ -317,7 +416,16 @@ define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_remote_memory(ptr add define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = or i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw or ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -332,7 +440,16 @@ define i64 @test_atomicrmw_or_i64_global_system__amdgpu_no_fine_grained_memory__ define i64 @test_atomicrmw_xor_i64_global_system(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_xor_i64_global_system( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8 +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = xor i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw xor ptr addrspace(1) %ptr, i64 %value seq_cst @@ -342,7 +459,16 @@ define i64 @test_atomicrmw_xor_i64_global_system(ptr addrspace(1) %ptr, i64 %val define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_fine_grained_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = xor i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw xor ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -352,7 +478,16 @@ define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_fine_grained_memory( define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = xor i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw xor ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory !0 @@ -362,7 +497,16 @@ define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_remote_memory(ptr ad define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_xor_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw xor ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[NEW:%.*]] = xor i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw xor ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -717,7 +861,18 @@ define i64 @test_atomicrmw_umin_i64_global_system__amdgpu_no_fine_grained_memory define i64 @test_atomicrmw_uinc_wrap_i64_global_system(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_system( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8 +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = add i64 [[LOADED]], 1 +; COMMON-NEXT: [[TMP3:%.*]] = icmp uge i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i64 0, i64 [[TMP2]] +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP4]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %value seq_cst @@ -727,7 +882,18 @@ define i64 @test_atomicrmw_uinc_wrap_i64_global_system(ptr addrspace(1) %ptr, i6 define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_fine_grained_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = add i64 [[LOADED]], 1 +; COMMON-NEXT: [[TMP3:%.*]] = icmp uge i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i64 0, i64 [[TMP2]] +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP4]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -737,7 +903,18 @@ define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_fine_grained_m define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = add i64 [[LOADED]], 1 +; COMMON-NEXT: [[TMP3:%.*]] = icmp uge i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i64 0, i64 [[TMP2]] +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP4]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory !0 @@ -747,7 +924,18 @@ define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_remote_memory( define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw uinc_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = add i64 [[LOADED]], 1 +; COMMON-NEXT: [[TMP3:%.*]] = icmp uge i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i64 0, i64 [[TMP2]] +; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP4]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw uinc_wrap ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 @@ -762,7 +950,20 @@ define i64 @test_atomicrmw_uinc_wrap_i64_global_system__amdgpu_no_fine_grained_m define i64 @test_atomicrmw_udec_wrap_i64_global_system(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_system( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8 +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = sub i64 [[LOADED]], 1 +; COMMON-NEXT: [[TMP3:%.*]] = icmp eq i64 [[LOADED]], 0 +; COMMON-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i64 [[VALUE]], i64 [[TMP2]] +; COMMON-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP6]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP6]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %value seq_cst @@ -772,7 +973,20 @@ define i64 @test_atomicrmw_udec_wrap_i64_global_system(ptr addrspace(1) %ptr, i6 define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_fine_grained_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = sub i64 [[LOADED]], 1 +; COMMON-NEXT: [[TMP3:%.*]] = icmp eq i64 [[LOADED]], 0 +; COMMON-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i64 [[VALUE]], i64 [[TMP2]] +; COMMON-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP6]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP6]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0 @@ -782,7 +996,20 @@ define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_fine_grained_m define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.remote.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = sub i64 [[LOADED]], 1 +; COMMON-NEXT: [[TMP3:%.*]] = icmp eq i64 [[LOADED]], 0 +; COMMON-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i64 [[VALUE]], i64 [[TMP2]] +; COMMON-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP6]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP6]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory !0 @@ -792,7 +1019,20 @@ define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_remote_memory( define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) { ; COMMON-LABEL: define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( ; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw udec_wrap ptr addrspace(1) [[PTR]], i64 [[VALUE]] seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8 +; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] +; COMMON: atomicrmw.start: +; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; COMMON-NEXT: [[TMP2:%.*]] = sub i64 [[LOADED]], 1 +; COMMON-NEXT: [[TMP3:%.*]] = icmp eq i64 [[LOADED]], 0 +; COMMON-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[LOADED]], [[VALUE]] +; COMMON-NEXT: [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]] +; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i64 [[VALUE]], i64 [[TMP2]] +; COMMON-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8 +; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP6]], 1 +; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP6]], 0 +; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; COMMON: atomicrmw.end: ; COMMON-NEXT: ret i64 [[NEWLOADED]] ; %res = atomicrmw udec_wrap ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8-system.ll index 088371f461ec..b548943a326b 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8-system.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8-system.ll @@ -162,8 +162,17 @@ define i8 @test_atomicrmw_and_i8_global_system(ptr addrspace(1) %ptr, i8 %value) ; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] ; CHECK-NEXT: [[ANDOPERAND:%.*]] = or i32 [[VALOPERAND_SHIFTED]], [[INV_MASK]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw and ptr addrspace(1) [[ALIGNEDADDR]], i32 [[ANDOPERAND]] seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = and i32 [[LOADED]], [[ANDOPERAND]] +; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED]] ; @@ -215,8 +224,17 @@ define i8 @test_atomicrmw_or_i8_global_system(ptr addrspace(1) %ptr, i8 %value) ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw or ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = or i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED]] ; @@ -235,8 +253,17 @@ define i8 @test_atomicrmw_xor_i8_global_system(ptr addrspace(1) %ptr, i8 %value) ; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1 ; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[VALUE:%.*]] to i32 ; CHECK-NEXT: [[VALOPERAND_SHIFTED:%.*]] = shl i32 [[TMP3]], [[SHIFTAMT]] -; CHECK-NEXT: [[TMP4:%.*]] = atomicrmw xor ptr addrspace(1) [[ALIGNEDADDR]], i32 [[VALOPERAND_SHIFTED]] seq_cst, align 4 -; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[TMP4]], [[SHIFTAMT]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP4]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = xor i32 [[LOADED]], [[VALOPERAND_SHIFTED]] +; CHECK-NEXT: [[TMP5:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP5]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: +; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]] ; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8 ; CHECK-NEXT: ret i8 [[EXTRACTED]] ; diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll index dc1107d9130d..9c6a76a194d3 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-integer-ops-0-to-add-0.ll @@ -93,7 +93,16 @@ define i32 @test_atomicrmw_or_0_local(ptr addrspace(3) %ptr) { define i32 @test_atomicrmw_or_1_global_system(ptr addrspace(1) %ptr) { ; CHECK-LABEL: define i32 @test_atomicrmw_or_1_global_system( ; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 1 seq_cst, align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = or i32 [[LOADED]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; CHECK-NEXT: [[RES]] = extractvalue { i32, i1 } [[TMP2]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: ; CHECK-NEXT: ret i32 [[RES]] ; %res = atomicrmw or ptr addrspace(1) %ptr, i32 1 seq_cst @@ -103,7 +112,16 @@ define i32 @test_atomicrmw_or_1_global_system(ptr addrspace(1) %ptr) { define i32 @test_atomicrmw_or_var_global_system(ptr addrspace(1) %ptr, i32 %val) { ; CHECK-LABEL: define i32 @test_atomicrmw_or_var_global_system( ; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[RES:%.*]] = atomicrmw or ptr addrspace(1) [[PTR]], i32 [[VAL]] seq_cst, align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR]], align 4 +; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] +; CHECK: atomicrmw.start: +; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], [[ATOMICRMW_START]] ] +; CHECK-NEXT: [[NEW:%.*]] = or i32 [[LOADED]], [[VAL]] +; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[NEW]] seq_cst seq_cst, align 4 +; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1 +; CHECK-NEXT: [[RES]] = extractvalue { i32, i1 } [[TMP2]], 0 +; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CHECK: atomicrmw.end: ; CHECK-NEXT: ret i32 [[RES]] ; %res = atomicrmw or ptr addrspace(1) %ptr, i32 %val seq_cst