
System-scope atomics need to use cmpxchg loops if we know nothing about the allocation the address comes from. Commit aea5980e26e6a87dab9f8acb10eb3a59dd143cb1 started this; this change expands the set to cover the remaining integer operations. Don't expand xchg and add; those theoretically should work over PCIe. This is a pre-commit which will introduce performance regressions. Subsequent changes will add handling of new atomicrmw metadata, which will avoid the expansion. Note this still isn't conservative enough; we do need to expand some device-scope atomics if the memory is in fine-grained remote memory.
12641 lines
525 KiB
LLVM
12641 lines
525 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GFX7 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
|
|
; ---------------------------------------------------------------------
; atomicrmw xchg
; ---------------------------------------------------------------------
|
|
; 64-bit seq_cst atomicrmw xchg through a flat pointer; the result (%tmp0) is
; never read. On bonaire, tonga and gfx900 the expected code is a single
; flat_atomic_swap_x2 without glc, then s_waitcnt and buffer_wbinvl1_vol.
; The checks contain one basic block only, i.e. no cmpxchg loop.
; NOTE(review): !1 is defined outside this excerpt; confirm which address
; spaces the !noalias.addrspace range excludes.
define void @flat_atomic_xchg_i64_noret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_xchg_i64_noret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xchg_i64_noret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xchg_i64_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw xchg ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; 64-bit seq_cst atomicrmw xchg, result unused, at %out + 4 x i64 (32 bytes).
; bonaire and tonga are expected to add 32 to the address with a 64-bit
; VALU add/addc pair before the swap; gfx900 is expected to fold the
; displacement into the instruction as offset:32.
; NOTE(review): !1 is defined outside this excerpt.
define void @flat_atomic_xchg_i64_noret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_xchg_i64_noret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xchg_i64_noret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xchg_i64_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; 64-bit seq_cst atomicrmw xchg through a flat pointer, returning the value
; produced by the atomicrmw. All three targets are expected to use the
; returning form of flat_atomic_swap_x2 (glc set) with the result written to
; v[0:1], the same registers that held the address.
; NOTE(review): !1 is defined outside this excerpt.
define i64 @flat_atomic_xchg_i64_ret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_xchg_i64_ret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xchg_i64_ret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xchg_i64_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw xchg ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; Returning 64-bit seq_cst atomicrmw xchg at %out + 4 x i64 (32 bytes).
; bonaire and tonga are expected to materialise the address with a VALU
; add/addc pair; gfx900 is expected to use offset:32 on the instruction.
; All targets use the glc (returning) form of flat_atomic_swap_x2.
; NOTE(review): !1 is defined outside this excerpt.
define i64 @flat_atomic_xchg_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_xchg_i64_ret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xchg_i64_ret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xchg_i64_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; Non-returning 64-bit seq_cst atomicrmw xchg with both arguments marked
; inreg under the amdgpu_gfx calling convention. The checks expect the
; pointer in s[4:5] and the value in s[6:7]; both are copied into VGPRs
; (address -> v[2:3], data -> v[0:1]) before the flat_atomic_swap_x2.
; NOTE(review): !1 is defined outside this excerpt.
define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_xchg_i64_noret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xchg_i64_noret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xchg_i64_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw xchg ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Non-returning 64-bit seq_cst atomicrmw xchg at %out + 32 bytes with inreg
; (SGPR) arguments under amdgpu_gfx. bonaire and tonga are expected to do the
; +32 with a scalar s_add_u32/s_addc_u32 pair into s[34:35] and then copy to
; VGPRs; gfx900 copies s[4:5] unchanged and uses offset:32 on the swap.
; NOTE(review): !1 is defined outside this excerpt.
define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_xchg_i64_noret_offset_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xchg_i64_noret_offset_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xchg_i64_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Returning 64-bit seq_cst atomicrmw xchg with inreg (SGPR) arguments under
; amdgpu_gfx. The checks expect s[6:7] (data) and s[4:5] (address) to be
; copied to v[0:1] and v[2:3], then a glc flat_atomic_swap_x2 whose result
; overwrites the data registers v[0:1].
; NOTE(review): !1 is defined outside this excerpt.
define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_xchg_i64_ret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xchg_i64_ret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xchg_i64_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw xchg ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; Returning 64-bit seq_cst atomicrmw xchg at %out + 32 bytes with inreg (SGPR)
; arguments under amdgpu_gfx. bonaire and tonga are expected to compute the
; address with s_add_u32/s_addc_u32 into s[34:35]; gfx900 is expected to use
; offset:32. All targets use the glc form and return the result in v[0:1].
; NOTE(review): !1 is defined outside this excerpt.
define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_xchg_i64_ret_offset_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xchg_i64_ret_offset_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xchg_i64_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; Non-returning 64-bit seq_cst atomicrmw xchg at %out + 32 bytes, with
; !amdgpu.no.remote.memory attached to the atomicrmw. The expected
; instruction sequence is the same as for @flat_atomic_xchg_i64_noret_offset,
; which has no such metadata: a plain flat_atomic_swap_x2 on every target.
; NOTE(review): !0 and !1 are defined outside this excerpt.
define void @flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Returning 64-bit seq_cst atomicrmw xchg at %out + 32 bytes, with
; !amdgpu.no.remote.memory attached to the atomicrmw. The expected
; instruction sequence is the same as for @flat_atomic_xchg_i64_ret_offset,
; which has no such metadata: a glc flat_atomic_swap_x2 on every target.
; NOTE(review): !0 and !1 are defined outside this excerpt.
define i64 @flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; ---------------------------------------------------------------------
; atomicrmw xchg f64
; ---------------------------------------------------------------------
|
|
; seq_cst atomicrmw xchg of a double through a flat pointer; the result
; (%tmp0) is never read. The expected code is the same non-returning
; flat_atomic_swap_x2 sequence that the checks for the i64 xchg tests in this
; file use, on bonaire, tonga and gfx900 alike.
; NOTE(review): !1 is defined outside this excerpt.
define void @flat_atomic_xchg_f64_noret(ptr %ptr, double %in) {
; GFX7-LABEL: flat_atomic_xchg_f64_noret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xchg_f64_noret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xchg_f64_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw xchg ptr %ptr, double %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Non-returning seq_cst atomicrmw xchg of a double at %out + 4 x double
; (32 bytes; the GEP index is an i32 here). bonaire and tonga are expected to
; add 32 with a VALU add/addc pair; gfx900 is expected to use offset:32.
; NOTE(review): !1 is defined outside this excerpt.
define void @flat_atomic_xchg_f64_noret_offset(ptr %out, double %in) {
; GFX7-LABEL: flat_atomic_xchg_f64_noret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xchg_f64_noret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xchg_f64_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr double, ptr %out, i32 4
  %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Returning seq_cst atomicrmw xchg of a double through a flat pointer. All
; three targets are expected to use the glc (returning) form of
; flat_atomic_swap_x2, with the result written to v[0:1].
; NOTE(review): !1 is defined outside this excerpt.
define double @flat_atomic_xchg_f64_ret(ptr %ptr, double %in) {
; GFX7-LABEL: flat_atomic_xchg_f64_ret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xchg_f64_ret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xchg_f64_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw xchg ptr %ptr, double %in seq_cst, !noalias.addrspace !1
  ret double %result
}
|
|
|
|
; Returning seq_cst atomicrmw xchg of a double at %out + 4 x double
; (32 bytes; the GEP index is an i32 here). bonaire and tonga are expected to
; add 32 with a VALU add/addc pair; gfx900 is expected to use offset:32.
; All targets use the glc form of flat_atomic_swap_x2.
; NOTE(review): !1 is defined outside this excerpt.
define double @flat_atomic_xchg_f64_ret_offset(ptr %out, double %in) {
; GFX7-LABEL: flat_atomic_xchg_f64_ret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xchg_f64_ret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xchg_f64_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr double, ptr %out, i32 4
  %result = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1
  ret double %result
}
|
|
|
|
; Non-returning seq_cst atomicrmw xchg of a double with both arguments marked
; inreg under the amdgpu_gfx calling convention. The checks expect the
; pointer in s[4:5] and the value in s[6:7]; both are copied into VGPRs
; (address -> v[2:3], data -> v[0:1]) before the flat_atomic_swap_x2.
; NOTE(review): !1 is defined outside this excerpt.
define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double inreg %in) {
; GFX7-LABEL: flat_atomic_xchg_f64_noret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xchg_f64_noret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xchg_f64_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw xchg ptr %ptr, double %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Non-returning seq_cst atomicrmw xchg of a double at %out + 32 bytes with
; inreg (SGPR) arguments under amdgpu_gfx. bonaire and tonga are expected to
; do the +32 with s_add_u32/s_addc_u32 into s[34:35] before copying to VGPRs;
; gfx900 copies s[4:5] unchanged and uses offset:32 on the swap.
; NOTE(review): !1 is defined outside this excerpt.
define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, double inreg %in) {
; GFX7-LABEL: flat_atomic_xchg_f64_noret_offset_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xchg_f64_noret_offset_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xchg_f64_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr double, ptr %out, i32 4
  %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Returning seq_cst atomicrmw xchg of a double with inreg (SGPR) arguments
; under amdgpu_gfx. The checks expect s[6:7] (data) and s[4:5] (address) to
; be copied to v[0:1] and v[2:3], then a glc flat_atomic_swap_x2 whose result
; overwrites the data registers v[0:1].
; NOTE(review): !1 is defined outside this excerpt.
define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double inreg %in) {
; GFX7-LABEL: flat_atomic_xchg_f64_ret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xchg_f64_ret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xchg_f64_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw xchg ptr %ptr, double %in seq_cst, !noalias.addrspace !1
  ret double %result
}
|
|
|
|
; Returning seq_cst atomicrmw xchg of a double at %out + 32 bytes with inreg
; (SGPR) arguments under amdgpu_gfx. bonaire and tonga are expected to compute
; the address with s_add_u32/s_addc_u32 into s[34:35]; gfx900 is expected to
; use offset:32. All targets use the glc form, result in v[0:1].
; NOTE(review): !1 is defined outside this excerpt.
define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, double inreg %in) {
; GFX7-LABEL: flat_atomic_xchg_f64_ret_offset_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xchg_f64_ret_offset_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xchg_f64_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr double, ptr %out, i32 4
  %result = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1
  ret double %result
}
|
|
|
|
; Non-returning seq_cst atomicrmw xchg of a double at %out + 32 bytes, with
; !amdgpu.no.remote.memory attached to the atomicrmw. The expected
; instruction sequence is the same as for @flat_atomic_xchg_f64_noret_offset,
; which has no such metadata: a plain flat_atomic_swap_x2 on every target.
; NOTE(review): !0 and !1 are defined outside this excerpt.
define void @flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory(ptr %out, double %in) {
; GFX7-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr double, ptr %out, i64 4
  %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Returning seq_cst atomicrmw xchg of a double at %out + 32 bytes, with
; !amdgpu.no.remote.memory attached to the atomicrmw. The expected
; instruction sequence is the same as for @flat_atomic_xchg_f64_ret_offset,
; which has no such metadata: a glc flat_atomic_swap_x2 on every target.
; NOTE(review): !0 and !1 are defined outside this excerpt.
define double @flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory(ptr %out, double %in) {
; GFX7-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr double, ptr %out, i64 4
  %result = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
  ret double %result
}
|
|
|
|
; ---------------------------------------------------------------------
; atomicrmw add
; ---------------------------------------------------------------------
|
|
; 64-bit seq_cst atomicrmw add through a flat pointer; the result (%tmp0) is
; never read. On bonaire, tonga and gfx900 the expected code is a single
; flat_atomic_add_x2 without glc, then s_waitcnt and buffer_wbinvl1_vol.
; The checks contain one basic block only, i.e. no cmpxchg loop.
; NOTE(review): !1 is defined outside this excerpt.
define void @flat_atomic_add_i64_noret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_add_i64_noret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_add_i64_noret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_add_i64_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw add ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Same as the no-return add case, but the address is %out + 4 x i64 (32 bytes).
; Per the checks, GFX7 and GFX8 add 32 to the address with a 64-bit VALU add,
; while GFX9 folds it into the instruction's offset:32 field.
; NOTE(review): !1 (!noalias.addrspace) is defined outside this excerpt.
define void @flat_atomic_add_i64_noret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_add_i64_noret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_add_i64_noret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_add_i64_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; seq_cst 64-bit `atomicrmw add` on a flat pointer, returning the old value.
; The checks expect the returning form (glc) of flat_atomic_add_x2 writing
; v[0:1] on every target, with no cmpxchg loop.
; NOTE(review): !1 (!noalias.addrspace) is defined outside this excerpt.
define i64 @flat_atomic_add_i64_ret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_add_i64_ret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_add_i64_ret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_add_i64_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw add ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; Returning 64-bit `atomicrmw add` at %out + 4 x i64 (32 bytes).
; Per the checks, GFX7 and GFX8 add 32 to the address before the atomic,
; while GFX9 uses offset:32 on the instruction. All use the glc form.
; NOTE(review): !1 (!noalias.addrspace) is defined outside this excerpt.
define i64 @flat_atomic_add_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_add_i64_ret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_add_i64_ret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_add_i64_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; No-return 64-bit `atomicrmw add` with both arguments passed `inreg` under
; the amdgpu_gfx calling convention. The checks show the pointer (s4, s5) and
; value (s6, s7) being copied into VGPRs before a single flat_atomic_add_x2.
; NOTE(review): !1 (!noalias.addrspace) is defined outside this excerpt.
define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_add_i64_noret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_add_i64_noret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_add_i64_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw add ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; No-return 64-bit `atomicrmw add` at %out + 32 bytes with `inreg` arguments.
; Per the checks, GFX7 and GFX8 compute the address with scalar adds
; (s_add_u32 / s_addc_u32) before copying to VGPRs, while GFX9 copies the
; base pointer and uses offset:32 on the atomic.
; NOTE(review): !1 (!noalias.addrspace) is defined outside this excerpt.
define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_add_i64_noret_offset_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_add_i64_noret_offset_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_add_i64_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Returning 64-bit `atomicrmw add` with `inreg` arguments under amdgpu_gfx.
; The checks show the SGPR inputs copied into VGPRs and a single glc
; flat_atomic_add_x2 whose result lands in v[0:1].
; NOTE(review): !1 (!noalias.addrspace) is defined outside this excerpt.
define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_add_i64_ret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_add_i64_ret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_add_i64_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw add ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; Returning 64-bit `atomicrmw add` at %out + 32 bytes with `inreg` arguments.
; Per the checks, GFX7 and GFX8 form the address with scalar adds, while GFX9
; uses offset:32 on the atomic. All return the old value via glc.
; NOTE(review): !1 (!noalias.addrspace) is defined outside this excerpt.
define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_add_i64_ret_offset_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_add_i64_ret_offset_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_add_i64_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; No-return 64-bit `atomicrmw add` at %out + 32 bytes, additionally tagged with
; !amdgpu.no.remote.memory. The expected code is the same instruction sequence
; as flat_atomic_add_i64_noret_offset: one flat_atomic_add_x2, no cmpxchg loop.
; NOTE(review): metadata nodes !0 and !1 are defined outside this excerpt.
define void @flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Returning 64-bit `atomicrmw add` at %out + 32 bytes, additionally tagged with
; !amdgpu.no.remote.memory. The expected code is the same instruction sequence
; as flat_atomic_add_i64_ret_offset: one glc flat_atomic_add_x2, no loop.
; NOTE(review): metadata nodes !0 and !1 are defined outside this excerpt.
define i64 @flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; ---------------------------------------------------------------------
; atomicrmw sub
; ---------------------------------------------------------------------
|
|
|
|
; seq_cst 64-bit `atomicrmw sub` on a flat pointer whose result is unused.
; Unlike the add tests, the checks expect a compare-and-swap loop
; (%atomicrmw.start): load the current value, compute old - %in, attempt
; flat_atomic_cmpswap_x2, and repeat until the returned value equals the
; expected one. GFX7 and GFX8 do the initial load as two dword loads; GFX9
; uses one flat_load_dwordx2.
; NOTE(review): !1 (!noalias.addrspace) is defined outside this excerpt.
define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_sub_i64_noret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: flat_load_dword v7, v[4:5]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v6, v2
; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB30_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_sub_i64_noret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: flat_load_dword v7, v[4:5]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v6, v2
; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB30_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_sub_i64_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB30_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; No-return 64-bit `atomicrmw sub` at %out + 4 x i64 (32 bytes). The checks
; expect a compare-and-swap loop. GFX7 and GFX8 materialize addresses +32 and
; +36 for two dword loads and run the cmpswap on the +32 address; GFX9 uses
; offset:32 on both the initial load and the cmpswap.
; NOTE(review): !1 (!noalias.addrspace) is defined outside this excerpt.
define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_sub_i64_noret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: flat_load_dword v6, v[8:9]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v6, v2
; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB31_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_sub_i64_noret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: flat_load_dword v6, v[8:9]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v6, v2
; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB31_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_sub_i64_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB31_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; seq_cst 64-bit `atomicrmw sub` on a flat pointer, returning the old value.
; The checks expect a compare-and-swap loop. Each iteration copies the last
; observed value into v[6:7] as the expected operand, and after the loop the
; value returned by the final cmpswap (v[4:5]) is moved to v[0:1].
; NOTE(review): !1 (!noalias.addrspace) is defined outside this excerpt.
define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_sub_i64_ret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v4, v[0:1]
; GFX7-NEXT: flat_load_dword v5, v[5:6]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v6, v2
; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB32_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v0, v4
; GFX7-NEXT: v_mov_b32_e32 v1, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_sub_i64_ret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: flat_load_dword v5, v[5:6]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v6, v2
; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB32_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_sub_i64_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB32_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; Returning 64-bit `atomicrmw sub` at %out + 4 x i64 (32 bytes). The checks
; expect a compare-and-swap loop. On GFX7 and GFX8 the cmpswap result is
; written straight into v[0:1], so no copy follows the loop. GFX9 uses
; offset:32 addressing and moves v[4:5] to v[0:1] after the loop.
; NOTE(review): !1 (!noalias.addrspace) is defined outside this excerpt.
define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_sub_i64_ret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v1, v[0:1]
; GFX7-NEXT: flat_load_dword v0, v[4:5]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: v_sub_i32_e32 v6, vcc, v8, v2
; GFX7-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB33_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_sub_i64_ret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v1, v[0:1]
; GFX8-NEXT: flat_load_dword v0, v[4:5]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v8, v2
; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB33_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_sub_i64_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB33_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; No-return 64-bit `atomicrmw sub` with `inreg` arguments under amdgpu_gfx.
; The checks expect a compare-and-swap loop whose exit mask lives in
; s[34:35]. The pointer (s4, s5) is copied into VGPRs for the loads and the
; cmpswap; the low half of %in is used directly from s6 and the high half
; (s7) is copied to v6 before the loop.
; NOTE(review): !1 (!noalias.addrspace) is defined outside this excerpt.
define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_sub_i64_noret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_add_u32 s34, s4, 4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s34
; GFX7-NEXT: v_mov_b32_e32 v4, s35
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: flat_load_dword v3, v[3:4]
; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_mov_b32_e32 v5, s5
; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2
; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB34_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_sub_i64_noret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_add_u32 s34, s4, 4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s34
; GFX8-NEXT: v_mov_b32_e32 v4, s35
; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: flat_load_dword v3, v[3:4]
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB34_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_sub_i64_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v6, s7
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v6, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB34_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Test: i64 `atomicrmw sub` (seq_cst, result unused) through a flat pointer
; advanced by 4 x i64 (32 bytes). Pointer and operand are passed `inreg` under
; the amdgpu_gfx calling convention, so the checks read them from s[4:5] and
; s[6:7]. All three targets are expected to emit a compare-and-swap retry loop
; (flat_atomic_cmpswap_x2). Only the gfx900 checks fold the offset into the
; memory instructions (offset:32); the bonaire and tonga checks form the
; address with scalar adds and load the two dwords separately.
; NOTE(review): !1 is defined outside this excerpt - confirm what
; !noalias.addrspace excludes.
define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
|
|
; GFX7-LABEL: flat_atomic_sub_i64_noret_offset_scalar:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s34, s4, 32
|
|
; GFX7-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX7-NEXT: s_add_u32 s36, s4, 36
|
|
; GFX7-NEXT: s_addc_u32 s37, s5, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s36
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s37
|
|
; GFX7-NEXT: v_mov_b32_e32 v4, s34
|
|
; GFX7-NEXT: v_mov_b32_e32 v5, s35
|
|
; GFX7-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v2, v[4:5]
|
|
; GFX7-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, s7
|
|
; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_subrev_i32_e32 v0, vcc, s6, v2
|
|
; GFX7-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, v0
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB35_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_sub_i64_noret_offset_scalar:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: s_add_u32 s34, s4, 32
|
|
; GFX8-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX8-NEXT: s_add_u32 s36, s4, 36
|
|
; GFX8-NEXT: s_addc_u32 s37, s5, 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s36
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s37
|
|
; GFX8-NEXT: v_mov_b32_e32 v4, s34
|
|
; GFX8-NEXT: v_mov_b32_e32 v5, s35
|
|
; GFX8-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v2, v[4:5]
|
|
; GFX8-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, s7
|
|
; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s6, v2
|
|
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v6, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, v0
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB35_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_sub_i64_noret_offset_scalar:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
|
|
; GFX9-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX9-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, s7
|
|
; GFX9-NEXT: v_mov_b32_e32 v5, s5
|
|
; GFX9-NEXT: .LBB35_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s6, v2
|
|
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3, v6, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX9-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, v0
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB35_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret void
|
|
}
|
|
|
|
; Test: i64 `atomicrmw sub` (seq_cst) through a flat pointer with no offset,
; returning the value produced by the atomicrmw. Pointer and operand are passed
; `inreg` under the amdgpu_gfx calling convention (s[4:5] and s[6:7] in the
; checks). All three targets are expected to emit a compare-and-swap retry loop
; (flat_atomic_cmpswap_x2) whose destination is v[0:1], so no copy is needed
; before the return. The bonaire and tonga checks load the initial value as two
; dwords; the gfx900 checks use a single flat_load_dwordx2.
; NOTE(review): !1 is defined outside this excerpt - confirm what
; !noalias.addrspace excludes.
define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
|
|
; GFX7-LABEL: flat_atomic_sub_i64_ret_scalar:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: s_add_u32 s34, s4, 4
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s34
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s35
|
|
; GFX7-NEXT: flat_load_dword v0, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v1, v[2:3]
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX7-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v4, s7
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s5
|
|
; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v8, v1
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v0
|
|
; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7
|
|
; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
|
|
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB36_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_sub_i64_ret_scalar:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX8-NEXT: s_add_u32 s34, s4, 4
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX8-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, s34
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, s35
|
|
; GFX8-NEXT: flat_load_dword v0, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v1, v[2:3]
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX8-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v4, s7
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, s5
|
|
; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v8, v1
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v0
|
|
; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7
|
|
; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
|
|
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB36_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_sub_i64_ret_scalar:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX9-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX9-NEXT: v_mov_b32_e32 v4, s7
|
|
; GFX9-NEXT: v_mov_b32_e32 v3, s5
|
|
; GFX9-NEXT: .LBB36_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v8, v1
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v0
|
|
; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7
|
|
; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v4, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
|
|
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB36_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret i64 %result
|
|
}
|
|
|
|
; Test: i64 `atomicrmw sub` (seq_cst) through a flat pointer advanced by
; 4 x i64 (32 bytes), returning the value produced by the atomicrmw. Pointer
; and operand are passed `inreg` under the amdgpu_gfx calling convention
; (s[4:5] and s[6:7] in the checks). All three targets are expected to emit a
; compare-and-swap retry loop (flat_atomic_cmpswap_x2) that leaves its result
; in v[0:1]. Only the gfx900 checks fold the offset into the memory
; instructions (offset:32); the bonaire and tonga checks form the addresses
; with scalar adds.
; NOTE(review): !1 is defined outside this excerpt - confirm what
; !noalias.addrspace excludes.
define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
|
|
; GFX7-LABEL: flat_atomic_sub_i64_ret_offset_scalar:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s34, s4, 32
|
|
; GFX7-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX7-NEXT: s_add_u32 s36, s4, 36
|
|
; GFX7-NEXT: s_addc_u32 s37, s5, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s36
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s37
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s34
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s35
|
|
; GFX7-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v0, v[2:3]
|
|
; GFX7-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v4, s7
|
|
; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v8, v1
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v0
|
|
; GFX7-NEXT: v_subrev_i32_e32 v5, vcc, s6, v7
|
|
; GFX7-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
|
|
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB37_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_sub_i64_ret_offset_scalar:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: s_add_u32 s34, s4, 32
|
|
; GFX8-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX8-NEXT: s_add_u32 s36, s4, 36
|
|
; GFX8-NEXT: s_addc_u32 s37, s5, 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s36
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s37
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, s34
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, s35
|
|
; GFX8-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v0, v[2:3]
|
|
; GFX8-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v4, s7
|
|
; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v8, v1
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v0
|
|
; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7
|
|
; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v8, v4, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
|
|
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB37_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_sub_i64_ret_offset_scalar:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX9-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX9-NEXT: v_mov_b32_e32 v4, s7
|
|
; GFX9-NEXT: v_mov_b32_e32 v3, s5
|
|
; GFX9-NEXT: .LBB37_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v8, v1
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v0
|
|
; GFX9-NEXT: v_subrev_co_u32_e32 v5, vcc, s6, v7
|
|
; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v8, v4, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[5:8] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
|
|
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB37_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret i64 %result
|
|
}
|
|
|
|
; Test: i64 `atomicrmw sub` (seq_cst, result unused) through a flat pointer
; advanced by 4 x i64 (32 bytes), where the atomicrmw also carries
; !amdgpu.no.remote.memory !0. Arguments use the default calling convention
; (pointer in v[0:1], operand in v[2:3] in the checks). Even with that
; metadata, the checks for all three targets still expect a compare-and-swap
; retry loop (flat_atomic_cmpswap_x2). Only the gfx900 checks fold the offset
; into the memory instructions (offset:32).
; NOTE(review): !0 and !1 are defined outside this excerpt - confirm their
; contents.
define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v7, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v6, v[8:9]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_sub_i32_e32 v4, vcc, v6, v2
|
|
; GFX7-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB38_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v7, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v6, v[8:9]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v6, v2
|
|
; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v7, v3, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB38_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB38_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
|
|
; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB38_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
|
|
ret void
|
|
}
|
|
|
|
; Test: i64 `atomicrmw sub` (seq_cst) through a flat pointer advanced by
; 4 x i64 (32 bytes), returning the value produced by the atomicrmw, where the
; atomicrmw also carries !amdgpu.no.remote.memory !0. Arguments use the default
; calling convention (pointer in v[0:1], operand in v[2:3] in the checks).
; Even with that metadata, the checks for all three targets still expect a
; compare-and-swap retry loop (flat_atomic_cmpswap_x2). In the gfx900 checks
; the loop result lives in v[4:5] and is copied to v[0:1] before returning; in
; the bonaire and tonga checks the cmpswap writes v[0:1] directly.
; NOTE(review): !0 and !1 are defined outside this excerpt - confirm their
; contents.
define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v0, v[4:5]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX7-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX7-NEXT: v_sub_i32_e32 v6, vcc, v8, v2
|
|
; GFX7-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB39_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v0, v[4:5]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX8-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v8, v2
|
|
; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v9, v3, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB39_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB39_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v6, v2
|
|
; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v3, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB39_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
|
|
ret i64 %result
|
|
}
|
|
|
|
; ---------------------------------------------------------------------
|
|
; atomicrmw and
|
|
; ---------------------------------------------------------------------
|
|
|
|
; Test: i64 `atomicrmw and` (seq_cst, result unused) through a flat pointer
; with no offset. Arguments use the default calling convention (pointer in
; v[0:1], operand in v[2:3] in the checks). All three targets are expected to
; emit a compare-and-swap retry loop (flat_atomic_cmpswap_x2) that computes the
; new value with two 32-bit v_and_b32 instructions. The bonaire and tonga
; checks load the initial value as two dwords; the gfx900 checks use a single
; flat_load_dwordx2.
; NOTE(review): !1 is defined outside this excerpt - confirm what
; !noalias.addrspace excludes.
define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_and_i64_noret:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v6, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v7, v[4:5]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_and_b32_e32 v5, v7, v3
|
|
; GFX7-NEXT: v_and_b32_e32 v4, v6, v2
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB40_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_and_i64_noret:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v6, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v7, v[4:5]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_and_b32_e32 v5, v7, v3
|
|
; GFX8-NEXT: v_and_b32_e32 v4, v6, v2
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB40_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_and_i64_noret:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB40_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_and_b32_e32 v5, v7, v3
|
|
; GFX9-NEXT: v_and_b32_e32 v4, v6, v2
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB40_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret void
|
|
}
|
|
|
|
; Test: i64 `atomicrmw and` (seq_cst, result unused) through a flat pointer
; advanced by 4 x i64 (32 bytes). Arguments use the default calling convention
; (pointer in v[0:1], operand in v[2:3] in the checks). All three targets are
; expected to emit a compare-and-swap retry loop (flat_atomic_cmpswap_x2).
; Only the gfx900 checks fold the offset into the memory instructions
; (offset:32); the bonaire and tonga checks add 32 and 36 to the pointer with
; vector adds and load the two dwords separately.
; NOTE(review): !1 is defined outside this excerpt - confirm what
; !noalias.addrspace excludes.
define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_and_i64_noret_offset:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v7, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v6, v[8:9]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_and_b32_e32 v5, v7, v3
|
|
; GFX7-NEXT: v_and_b32_e32 v4, v6, v2
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB41_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_and_i64_noret_offset:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v7, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v6, v[8:9]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_and_b32_e32 v5, v7, v3
|
|
; GFX8-NEXT: v_and_b32_e32 v4, v6, v2
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB41_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_and_i64_noret_offset:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB41_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_and_b32_e32 v5, v7, v3
|
|
; GFX9-NEXT: v_and_b32_e32 v4, v6, v2
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB41_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret void
|
|
}
|
|
|
|
; Test: i64 `atomicrmw and` (seq_cst) through a flat pointer with no offset,
; returning the value produced by the atomicrmw. Arguments use the default
; calling convention (pointer in v[0:1], operand in v[2:3] in the checks). All
; three targets are expected to emit a compare-and-swap retry loop
; (flat_atomic_cmpswap_x2) that keeps its result in v[4:5], followed by copies
; into v[0:1] before the return.
; NOTE(review): !1 is defined outside this excerpt - confirm what
; !noalias.addrspace excludes.
define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_and_i64_ret:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v4, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v5, v[5:6]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX7-NEXT: v_and_b32_e32 v5, v7, v3
|
|
; GFX7-NEXT: v_and_b32_e32 v4, v6, v2
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB42_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_and_i64_ret:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v4, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v5, v[5:6]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX8-NEXT: v_and_b32_e32 v5, v7, v3
|
|
; GFX8-NEXT: v_and_b32_e32 v4, v6, v2
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB42_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_and_i64_ret:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB42_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: v_and_b32_e32 v5, v7, v3
|
|
; GFX9-NEXT: v_and_b32_e32 v4, v6, v2
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB42_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret i64 %result
|
|
}
|
|
|
|
; Returning 64-bit seq_cst `atomicrmw and` on a flat pointer at element index 4
; (byte offset 32). The expected code for all three RUN lines is a
; compare-and-swap loop built around flat_atomic_cmpswap_x2. The bonaire and
; tonga runs compute the low/high dword addresses (+32 / +36) and load the two
; halves separately; the gfx900 run uses one dwordx2 load with an offset:32
; immediate.
define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_and_i64_ret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v1, v[0:1]
; GFX7-NEXT: flat_load_dword v0, v[4:5]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: v_and_b32_e32 v7, v9, v3
; GFX7-NEXT: v_and_b32_e32 v6, v8, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB43_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_and_i64_ret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v1, v[0:1]
; GFX8-NEXT: flat_load_dword v0, v[4:5]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
; GFX8-NEXT: v_and_b32_e32 v7, v9, v3
; GFX8-NEXT: v_and_b32_e32 v6, v8, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB43_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_and_i64_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_and_b32_e32 v5, v7, v3
; GFX9-NEXT: v_and_b32_e32 v4, v6, v2
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB43_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; Non-returning 64-bit seq_cst `atomicrmw and` with the pointer and operand
; passed `inreg` under the amdgpu_gfx calling convention. In the expected code
; the pointer arrives in s[4:5] and the operand in s[6:7]; the address is copied
; into VGPRs and the operation is performed as a flat_atomic_cmpswap_x2 loop on
; every RUN line.
define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_and_i64_noret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_add_u32 s34, s4, 4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s34
; GFX7-NEXT: v_mov_b32_e32 v4, s35
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: flat_load_dword v3, v[3:4]
; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v5, s5
; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, s7, v3
; GFX7-NEXT: v_and_b32_e32 v0, s6, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB44_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_and_i64_noret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_add_u32 s34, s4, 4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s34
; GFX8-NEXT: v_mov_b32_e32 v4, s35
; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: flat_load_dword v3, v[3:4]
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v1, s7, v3
; GFX8-NEXT: v_and_b32_e32 v0, s6, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB44_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_and_i64_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v1, s7, v3
; GFX9-NEXT: v_and_b32_e32 v0, s6, v2
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB44_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Non-returning 64-bit seq_cst `atomicrmw and` with `inreg` (SGPR) arguments,
; applied at element index 4 (byte offset 32) from %out. The bonaire and tonga
; runs form the +32 / +36 addresses with scalar adds before the
; flat_atomic_cmpswap_x2 loop; the gfx900 run folds the offset into the
; instructions as offset:32.
define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_and_i64_noret_offset_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: s_add_u32 s36, s4, 36
; GFX7-NEXT: s_addc_u32 s37, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s36
; GFX7-NEXT: v_mov_b32_e32 v1, s37
; GFX7-NEXT: v_mov_b32_e32 v4, s34
; GFX7-NEXT: v_mov_b32_e32 v5, s35
; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: flat_load_dword v2, v[4:5]
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, s7, v3
; GFX7-NEXT: v_and_b32_e32 v0, s6, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB45_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_and_i64_noret_offset_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: s_add_u32 s36, s4, 36
; GFX8-NEXT: s_addc_u32 s37, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s36
; GFX8-NEXT: v_mov_b32_e32 v1, s37
; GFX8-NEXT: v_mov_b32_e32 v4, s34
; GFX8-NEXT: v_mov_b32_e32 v5, s35
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[4:5]
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v1, s7, v3
; GFX8-NEXT: v_and_b32_e32 v0, s6, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB45_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_and_i64_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v1, s7, v3
; GFX9-NEXT: v_and_b32_e32 v0, s6, v2
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB45_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Returning 64-bit seq_cst `atomicrmw and` with `inreg` (SGPR) pointer and
; operand under the amdgpu_gfx calling convention. The expected code is a
; flat_atomic_cmpswap_x2 loop on every RUN line, and the value produced by the
; final compare-and-swap is left in v[0:1] when the function returns.
define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_and_i64_ret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_add_u32 s34, s4, 4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: flat_load_dword v1, v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: v_and_b32_e32 v5, s7, v7
; GFX7-NEXT: v_and_b32_e32 v4, s6, v6
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB46_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_and_i64_ret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_add_u32 s34, s4, 4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: v_and_b32_e32 v5, s7, v7
; GFX8-NEXT: v_and_b32_e32 v4, s6, v6
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB46_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_and_i64_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, v1
; GFX9-NEXT: v_mov_b32_e32 v6, v0
; GFX9-NEXT: v_and_b32_e32 v5, s7, v7
; GFX9-NEXT: v_and_b32_e32 v4, s6, v6
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB46_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; Returning 64-bit seq_cst `atomicrmw and` with `inreg` (SGPR) arguments,
; applied at element index 4 (byte offset 32) from %out. The bonaire and tonga
; runs form the +32 / +36 addresses with scalar adds; the gfx900 run encodes the
; offset as offset:32. All three expect a flat_atomic_cmpswap_x2 loop that
; leaves its result in v[0:1].
define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_and_i64_ret_offset_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: s_add_u32 s36, s4, 36
; GFX7-NEXT: s_addc_u32 s37, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s36
; GFX7-NEXT: v_mov_b32_e32 v1, s37
; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v1, v[0:1]
; GFX7-NEXT: flat_load_dword v0, v[2:3]
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: v_and_b32_e32 v5, s7, v7
; GFX7-NEXT: v_and_b32_e32 v4, s6, v6
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB47_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_and_i64_ret_offset_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: s_add_u32 s36, s4, 36
; GFX8-NEXT: s_addc_u32 s37, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s36
; GFX8-NEXT: v_mov_b32_e32 v1, s37
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v1, v[0:1]
; GFX8-NEXT: flat_load_dword v0, v[2:3]
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: v_and_b32_e32 v5, s7, v7
; GFX8-NEXT: v_and_b32_e32 v4, s6, v6
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB47_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_and_i64_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, v1
; GFX9-NEXT: v_mov_b32_e32 v6, v0
; GFX9-NEXT: v_and_b32_e32 v5, s7, v7
; GFX9-NEXT: v_and_b32_e32 v4, s6, v6
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB47_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; Non-returning 64-bit seq_cst `atomicrmw and` at byte offset 32, with the
; !amdgpu.no.remote.memory annotation attached to the atomic. The expected
; output recorded below is still the flat_atomic_cmpswap_x2 loop on every RUN
; line, i.e. the annotation does not change the generated code in these checks.
define void @flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: flat_load_dword v6, v[8:9]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v5, v7, v3
; GFX7-NEXT: v_and_b32_e32 v4, v6, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB48_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: flat_load_dword v6, v[8:9]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v5, v7, v3
; GFX8-NEXT: v_and_b32_e32 v4, v6, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB48_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v5, v7, v3
; GFX9-NEXT: v_and_b32_e32 v4, v6, v2
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB48_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Returning 64-bit seq_cst `atomicrmw and` at byte offset 32, with the
; !amdgpu.no.remote.memory annotation attached to the atomic. The expected
; output recorded below is still the flat_atomic_cmpswap_x2 loop on every RUN
; line; the gfx900 run copies the loop result from v[4:5] into v[0:1] before
; returning.
define i64 @flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v1, v[0:1]
; GFX7-NEXT: flat_load_dword v0, v[4:5]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: v_and_b32_e32 v7, v9, v3
; GFX7-NEXT: v_and_b32_e32 v6, v8, v2
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB49_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v1, v[0:1]
; GFX8-NEXT: flat_load_dword v0, v[4:5]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
; GFX8-NEXT: v_and_b32_e32 v7, v9, v3
; GFX8-NEXT: v_and_b32_e32 v6, v8, v2
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB49_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_and_b32_e32 v5, v7, v3
; GFX9-NEXT: v_and_b32_e32 v4, v6, v2
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB49_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; ---------------------------------------------------------------------
|
|
; atomicrmw nand
|
|
; ---------------------------------------------------------------------
|
|
|
|
; Non-returning 64-bit seq_cst `atomicrmw nand` on a flat pointer with no
; offset. Every RUN line expects a flat_atomic_cmpswap_x2 loop in which the new
; value is computed per 32-bit half as v_and followed by v_not.
define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_nand_i64_noret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: flat_load_dword v7, v[4:5]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v4, v7, v3
; GFX7-NEXT: v_and_b32_e32 v8, v6, v2
; GFX7-NEXT: v_not_b32_e32 v5, v4
; GFX7-NEXT: v_not_b32_e32 v4, v8
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB50_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_nand_i64_noret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: flat_load_dword v7, v[4:5]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v4, v7, v3
; GFX8-NEXT: v_and_b32_e32 v8, v6, v2
; GFX8-NEXT: v_not_b32_e32 v5, v4
; GFX8-NEXT: v_not_b32_e32 v4, v8
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB50_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_nand_i64_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v4, v7, v3
; GFX9-NEXT: v_and_b32_e32 v8, v6, v2
; GFX9-NEXT: v_not_b32_e32 v5, v4
; GFX9-NEXT: v_not_b32_e32 v4, v8
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB50_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw nand ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Test: seq_cst i64 `atomicrmw nand` through a flat pointer at element index 4
; (byte offset 32), result unused. The expected code on all three targets is a
; compare-and-swap loop: load the current value, compute ~(old & in), attempt
; flat_atomic_cmpswap_x2, and repeat until the returned value equals the
; expected one. The GFX7 and GFX8 checks materialize the +32/+36 addresses with
; VALU adds and use two dword loads, while the GFX9 checks use a single
; dwordx2 load with an immediate offset:32.
; Metadata node !1 is defined elsewhere in the file.
define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_nand_i64_noret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: flat_load_dword v6, v[8:9]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, v7, v3
; GFX7-NEXT: v_and_b32_e32 v1, v6, v2
; GFX7-NEXT: v_not_b32_e32 v5, v0
; GFX7-NEXT: v_not_b32_e32 v4, v1
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB51_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_nand_i64_noret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: flat_load_dword v6, v[8:9]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, v7, v3
; GFX8-NEXT: v_and_b32_e32 v1, v6, v2
; GFX8-NEXT: v_not_b32_e32 v5, v0
; GFX8-NEXT: v_not_b32_e32 v4, v1
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB51_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_nand_i64_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v4, v7, v3
; GFX9-NEXT: v_and_b32_e32 v8, v6, v2
; GFX9-NEXT: v_not_b32_e32 v5, v4
; GFX9-NEXT: v_not_b32_e32 v4, v8
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB51_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Test: seq_cst i64 `atomicrmw nand` through a flat pointer with the old value
; returned. The expected code is a compare-and-swap loop that recomputes
; ~(old & in) on every iteration; after the loop the value returned by the
; final flat_atomic_cmpswap_x2 is copied from v[4:5] into v[0:1] as the
; function result.
; Metadata node !1 is defined elsewhere in the file.
define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_nand_i64_ret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v4, v[0:1]
; GFX7-NEXT: flat_load_dword v5, v[5:6]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_and_b32_e32 v4, v7, v3
; GFX7-NEXT: v_and_b32_e32 v8, v6, v2
; GFX7-NEXT: v_not_b32_e32 v5, v4
; GFX7-NEXT: v_not_b32_e32 v4, v8
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB52_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v0, v4
; GFX7-NEXT: v_mov_b32_e32 v1, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_nand_i64_ret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: flat_load_dword v5, v[5:6]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: v_and_b32_e32 v4, v7, v3
; GFX8-NEXT: v_and_b32_e32 v8, v6, v2
; GFX8-NEXT: v_not_b32_e32 v5, v4
; GFX8-NEXT: v_not_b32_e32 v4, v8
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB52_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_nand_i64_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_and_b32_e32 v4, v7, v3
; GFX9-NEXT: v_and_b32_e32 v8, v6, v2
; GFX9-NEXT: v_not_b32_e32 v5, v4
; GFX9-NEXT: v_not_b32_e32 v4, v8
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB52_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw nand ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; Test: seq_cst i64 `atomicrmw nand` through a flat pointer at element index 4
; (byte offset 32), with the old value returned. The expected code is a
; compare-and-swap loop. In the GFX7 and GFX8 checks the cmpswap result already
; lands in v[0:1], so no copy follows the loop; in the GFX9 checks the result
; is produced in v[4:5] and copied to v[0:1] before returning.
; Metadata node !1 is defined elsewhere in the file.
define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_nand_i64_ret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v1, v[0:1]
; GFX7-NEXT: flat_load_dword v0, v[4:5]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: v_and_b32_e32 v0, v9, v3
; GFX7-NEXT: v_and_b32_e32 v1, v8, v2
; GFX7-NEXT: v_not_b32_e32 v7, v0
; GFX7-NEXT: v_not_b32_e32 v6, v1
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB53_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_nand_i64_ret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v1, v[0:1]
; GFX8-NEXT: flat_load_dword v0, v[4:5]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
; GFX8-NEXT: v_and_b32_e32 v0, v9, v3
; GFX8-NEXT: v_and_b32_e32 v1, v8, v2
; GFX8-NEXT: v_not_b32_e32 v7, v0
; GFX8-NEXT: v_not_b32_e32 v6, v1
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB53_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_nand_i64_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_and_b32_e32 v4, v7, v3
; GFX9-NEXT: v_and_b32_e32 v8, v6, v2
; GFX9-NEXT: v_not_b32_e32 v5, v4
; GFX9-NEXT: v_not_b32_e32 v4, v8
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB53_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; Test: seq_cst i64 `atomicrmw nand` through a flat pointer, result unused,
; with both arguments marked `inreg` under the amdgpu_gfx calling convention.
; In the checks the pointer is read from s[4:5] and the operand from s[6:7];
; the pointer is copied into VGPRs for the flat accesses and the operand is
; used directly as the scalar source of the v_and instructions inside the
; compare-and-swap loop. The loop's exec-mask accumulator lives in s[34:35].
; Metadata node !1 is defined elsewhere in the file.
define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_nand_i64_noret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_add_u32 s34, s4, 4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s34
; GFX7-NEXT: v_mov_b32_e32 v4, s35
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: flat_load_dword v3, v[3:4]
; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v5, s5
; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, s7, v3
; GFX7-NEXT: v_and_b32_e32 v6, s6, v2
; GFX7-NEXT: v_not_b32_e32 v1, v0
; GFX7-NEXT: v_not_b32_e32 v0, v6
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB54_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_nand_i64_noret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_add_u32 s34, s4, 4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s34
; GFX8-NEXT: v_mov_b32_e32 v4, s35
; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: flat_load_dword v3, v[3:4]
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, s7, v3
; GFX8-NEXT: v_and_b32_e32 v6, s6, v2
; GFX8-NEXT: v_not_b32_e32 v1, v0
; GFX8-NEXT: v_not_b32_e32 v0, v6
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB54_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_nand_i64_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, s7, v3
; GFX9-NEXT: v_and_b32_e32 v6, s6, v2
; GFX9-NEXT: v_not_b32_e32 v1, v0
; GFX9-NEXT: v_not_b32_e32 v0, v6
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB54_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw nand ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Test: seq_cst i64 `atomicrmw nand` through a flat pointer at element index 4
; (byte offset 32), result unused, with `inreg` arguments under amdgpu_gfx.
; In the GFX7 and GFX8 checks the +32 and +36 addresses are computed with
; scalar add/addc before being copied to VGPRs; in the GFX9 checks the base
; pointer is copied unchanged and the flat instructions carry offset:32.
; The expected lowering is a compare-and-swap loop on every target.
; Metadata node !1 is defined elsewhere in the file.
define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_nand_i64_noret_offset_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: s_add_u32 s36, s4, 36
; GFX7-NEXT: s_addc_u32 s37, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s36
; GFX7-NEXT: v_mov_b32_e32 v1, s37
; GFX7-NEXT: v_mov_b32_e32 v4, s34
; GFX7-NEXT: v_mov_b32_e32 v5, s35
; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: flat_load_dword v2, v[4:5]
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, s7, v3
; GFX7-NEXT: v_and_b32_e32 v6, s6, v2
; GFX7-NEXT: v_not_b32_e32 v1, v0
; GFX7-NEXT: v_not_b32_e32 v0, v6
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB55_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_nand_i64_noret_offset_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: s_add_u32 s36, s4, 36
; GFX8-NEXT: s_addc_u32 s37, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s36
; GFX8-NEXT: v_mov_b32_e32 v1, s37
; GFX8-NEXT: v_mov_b32_e32 v4, s34
; GFX8-NEXT: v_mov_b32_e32 v5, s35
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[4:5]
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, s7, v3
; GFX8-NEXT: v_and_b32_e32 v6, s6, v2
; GFX8-NEXT: v_not_b32_e32 v1, v0
; GFX8-NEXT: v_not_b32_e32 v0, v6
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB55_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_nand_i64_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, s7, v3
; GFX9-NEXT: v_and_b32_e32 v6, s6, v2
; GFX9-NEXT: v_not_b32_e32 v1, v0
; GFX9-NEXT: v_not_b32_e32 v0, v6
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB55_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Test: seq_cst i64 `atomicrmw nand` through a flat pointer with the old value
; returned, using `inreg` arguments under amdgpu_gfx. In the checks the pointer
; is read from s[4:5] and the operand from s[6:7]. The expected lowering is a
; compare-and-swap loop whose cmpswap result is written to v[0:1], which is
; also where the function result is left (no copy follows the loop).
; Metadata node !1 is defined elsewhere in the file.
define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_nand_i64_ret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_add_u32 s34, s4, 4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: flat_load_dword v1, v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: v_and_b32_e32 v0, s7, v7
; GFX7-NEXT: v_and_b32_e32 v1, s6, v6
; GFX7-NEXT: v_not_b32_e32 v5, v0
; GFX7-NEXT: v_not_b32_e32 v4, v1
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB56_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_nand_i64_ret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_add_u32 s34, s4, 4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: v_and_b32_e32 v0, s7, v7
; GFX8-NEXT: v_and_b32_e32 v1, s6, v6
; GFX8-NEXT: v_not_b32_e32 v5, v0
; GFX8-NEXT: v_not_b32_e32 v4, v1
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB56_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_nand_i64_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, v1
; GFX9-NEXT: v_mov_b32_e32 v6, v0
; GFX9-NEXT: v_and_b32_e32 v0, s7, v7
; GFX9-NEXT: v_and_b32_e32 v1, s6, v6
; GFX9-NEXT: v_not_b32_e32 v5, v0
; GFX9-NEXT: v_not_b32_e32 v4, v1
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB56_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw nand ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; Test: seq_cst i64 `atomicrmw nand` through a flat pointer at element index 4
; (byte offset 32), with the old value returned and `inreg` arguments under
; amdgpu_gfx. In the GFX7 and GFX8 checks the +32 and +36 addresses are formed
; with scalar add/addc; in the GFX9 checks the flat instructions use
; offset:32. The expected lowering is a compare-and-swap loop that leaves the
; result in v[0:1].
; Metadata node !1 is defined elsewhere in the file.
define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_nand_i64_ret_offset_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: s_add_u32 s36, s4, 36
; GFX7-NEXT: s_addc_u32 s37, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s36
; GFX7-NEXT: v_mov_b32_e32 v1, s37
; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v1, v[0:1]
; GFX7-NEXT: flat_load_dword v0, v[2:3]
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: v_and_b32_e32 v0, s7, v7
; GFX7-NEXT: v_and_b32_e32 v1, s6, v6
; GFX7-NEXT: v_not_b32_e32 v5, v0
; GFX7-NEXT: v_not_b32_e32 v4, v1
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB57_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_nand_i64_ret_offset_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: s_add_u32 s36, s4, 36
; GFX8-NEXT: s_addc_u32 s37, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s36
; GFX8-NEXT: v_mov_b32_e32 v1, s37
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v1, v[0:1]
; GFX8-NEXT: flat_load_dword v0, v[2:3]
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: v_and_b32_e32 v0, s7, v7
; GFX8-NEXT: v_and_b32_e32 v1, s6, v6
; GFX8-NEXT: v_not_b32_e32 v5, v0
; GFX8-NEXT: v_not_b32_e32 v4, v1
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB57_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_nand_i64_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, v1
; GFX9-NEXT: v_mov_b32_e32 v6, v0
; GFX9-NEXT: v_and_b32_e32 v0, s7, v7
; GFX9-NEXT: v_and_b32_e32 v1, s6, v6
; GFX9-NEXT: v_not_b32_e32 v5, v0
; GFX9-NEXT: v_not_b32_e32 v4, v1
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB57_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; Test: seq_cst i64 `atomicrmw nand` through a flat pointer at element index 4
; (byte offset 32), result unused, with the atomicrmw additionally tagged
; `!amdgpu.no.remote.memory`. The expected code is still a compare-and-swap
; loop on all three targets; the instruction sequence in these checks matches
; the one for flat_atomic_nand_i64_noret_offset apart from the function name
; and basic-block label number.
; Metadata nodes !0 and !1 are defined elsewhere in the file.
define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: flat_load_dword v6, v[8:9]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, v7, v3
; GFX7-NEXT: v_and_b32_e32 v1, v6, v2
; GFX7-NEXT: v_not_b32_e32 v5, v0
; GFX7-NEXT: v_not_b32_e32 v4, v1
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB58_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: flat_load_dword v6, v[8:9]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, v7, v3
; GFX8-NEXT: v_and_b32_e32 v1, v6, v2
; GFX8-NEXT: v_not_b32_e32 v5, v0
; GFX8-NEXT: v_not_b32_e32 v4, v1
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB58_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v4, v7, v3
; GFX9-NEXT: v_and_b32_e32 v8, v6, v2
; GFX9-NEXT: v_not_b32_e32 v5, v4
; GFX9-NEXT: v_not_b32_e32 v4, v8
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB58_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Returning i64 seq_cst `atomicrmw nand` at %out + 32 bytes (gep of 4 x i64),
; tagged with !amdgpu.no.remote.memory and !noalias.addrspace.
; The assertions below expect a flat_atomic_cmpswap_x2 retry loop for all three
; run lines, with v_and + v_not recomputing the new value on every iteration.
; On GFX9 the 32-byte offset is folded into the memory instructions, while on
; GFX7 and GFX8 the addresses are formed with explicit v_add/v_addc pairs.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v0, v[4:5]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX7-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX7-NEXT: v_and_b32_e32 v0, v9, v3
|
|
; GFX7-NEXT: v_and_b32_e32 v1, v8, v2
|
|
; GFX7-NEXT: v_not_b32_e32 v7, v0
|
|
; GFX7-NEXT: v_not_b32_e32 v6, v1
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB59_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v0, v[4:5]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX8-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX8-NEXT: v_and_b32_e32 v0, v9, v3
|
|
; GFX8-NEXT: v_and_b32_e32 v1, v8, v2
|
|
; GFX8-NEXT: v_not_b32_e32 v7, v0
|
|
; GFX8-NEXT: v_not_b32_e32 v6, v1
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB59_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB59_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: v_and_b32_e32 v4, v7, v3
|
|
; GFX9-NEXT: v_and_b32_e32 v8, v6, v2
|
|
; GFX9-NEXT: v_not_b32_e32 v5, v4
|
|
; GFX9-NEXT: v_not_b32_e32 v4, v8
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB59_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
|
|
ret i64 %result
|
|
}
|
|
|
|
; ---------------------------------------------------------------------
|
|
; atomicrmw or
|
|
; ---------------------------------------------------------------------
|
|
|
|
; i64 seq_cst `atomicrmw or` applied directly to the flat pointer %ptr; the
; result (%tmp0) is unused. Only !noalias.addrspace is attached to the atomic.
; The assertions below expect a flat_atomic_cmpswap_x2 retry loop for all three
; run lines, with a pair of v_or_b32 computing the new value on every iteration.
; GFX7 and GFX8 read the initial value with two dword loads (address + 0 and
; address + 4); GFX9 uses a single flat_load_dwordx2.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_or_i64_noret:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v6, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v7, v[4:5]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_or_b32_e32 v5, v7, v3
|
|
; GFX7-NEXT: v_or_b32_e32 v4, v6, v2
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB60_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_or_i64_noret:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v6, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v7, v[4:5]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_or_b32_e32 v5, v7, v3
|
|
; GFX8-NEXT: v_or_b32_e32 v4, v6, v2
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB60_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_or_i64_noret:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB60_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_or_b32_e32 v5, v7, v3
|
|
; GFX9-NEXT: v_or_b32_e32 v4, v6, v2
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB60_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret void
|
|
}
|
|
|
|
; i64 seq_cst `atomicrmw or` at %out + 32 bytes (gep of 4 x i64); the result
; (%tmp0) is unused. Only !noalias.addrspace is attached to the atomic.
; The assertions below expect a flat_atomic_cmpswap_x2 retry loop for all three
; run lines. On GFX9 the 32-byte offset is folded into the load and the
; cmpswap (offset:32), while on GFX7 and GFX8 the addresses + 32 and + 36 are
; formed with explicit v_add/v_addc pairs.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_or_i64_noret_offset:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v7, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v6, v[8:9]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_or_b32_e32 v5, v7, v3
|
|
; GFX7-NEXT: v_or_b32_e32 v4, v6, v2
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB61_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_or_i64_noret_offset:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v7, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v6, v[8:9]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_or_b32_e32 v5, v7, v3
|
|
; GFX8-NEXT: v_or_b32_e32 v4, v6, v2
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB61_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_or_i64_noret_offset:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB61_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_or_b32_e32 v5, v7, v3
|
|
; GFX9-NEXT: v_or_b32_e32 v4, v6, v2
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB61_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret void
|
|
}
|
|
|
|
; Returning i64 seq_cst `atomicrmw or` applied directly to the flat pointer
; %ptr. Only !noalias.addrspace is attached to the atomic.
; The assertions below expect a flat_atomic_cmpswap_x2 retry loop for all three
; run lines; the loop keeps the pointer in v[0:1], so after the loop the value
; produced by the cmpswap is copied from v[4:5] into v[0:1] before returning.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_or_i64_ret:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v4, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v5, v[5:6]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB62_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX7-NEXT: v_or_b32_e32 v5, v7, v3
|
|
; GFX7-NEXT: v_or_b32_e32 v4, v6, v2
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB62_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_or_i64_ret:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v4, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v5, v[5:6]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB62_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX8-NEXT: v_or_b32_e32 v5, v7, v3
|
|
; GFX8-NEXT: v_or_b32_e32 v4, v6, v2
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB62_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_or_i64_ret:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB62_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: v_or_b32_e32 v5, v7, v3
|
|
; GFX9-NEXT: v_or_b32_e32 v4, v6, v2
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB62_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret i64 %result
|
|
}
|
|
|
|
; Returning i64 seq_cst `atomicrmw or` at %out + 32 bytes (gep of 4 x i64).
; Only !noalias.addrspace is attached to the atomic.
; The assertions below expect a flat_atomic_cmpswap_x2 retry loop for all three
; run lines. On GFX7 and GFX8 the offset address lives in v[4:5] and the
; cmpswap writes straight into v[0:1], so no copies follow the loop. On GFX9
; the offset is folded into the instructions (offset:32), the pointer stays in
; v[0:1], and the result is copied from v[4:5] into v[0:1] after the loop.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_or_i64_ret_offset:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v0, v[4:5]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB63_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX7-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX7-NEXT: v_or_b32_e32 v7, v9, v3
|
|
; GFX7-NEXT: v_or_b32_e32 v6, v8, v2
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB63_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_or_i64_ret_offset:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v0, v[4:5]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB63_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX8-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX8-NEXT: v_or_b32_e32 v7, v9, v3
|
|
; GFX8-NEXT: v_or_b32_e32 v6, v8, v2
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB63_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_or_i64_ret_offset:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB63_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: v_or_b32_e32 v5, v7, v3
|
|
; GFX9-NEXT: v_or_b32_e32 v4, v6, v2
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB63_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret i64 %result
|
|
}
|
|
|
|
; amdgpu_gfx variant with both arguments marked inreg: i64 seq_cst
; `atomicrmw or` applied directly to %ptr, result (%tmp0) unused.
; In the assertions below the pointer is read from s[4:5] and copied into
; VGPRs, the operand %in is used from s6/s7 as the scalar source of the
; v_or_b32 pair, and the loop's accumulated exit mask is kept in s[34:35].
; All three run lines expect a flat_atomic_cmpswap_x2 retry loop.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
|
|
; GFX7-LABEL: flat_atomic_or_i64_noret_scalar:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: s_add_u32 s34, s4, 4
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s34
|
|
; GFX7-NEXT: v_mov_b32_e32 v4, s35
|
|
; GFX7-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v3, v[3:4]
|
|
; GFX7-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX7-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v5, s5
|
|
; GFX7-NEXT: .LBB64_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_or_b32_e32 v1, s7, v3
|
|
; GFX7-NEXT: v_or_b32_e32 v0, s6, v2
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, v0
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB64_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_or_i64_noret_scalar:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX8-NEXT: s_add_u32 s34, s4, 4
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX8-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, s34
|
|
; GFX8-NEXT: v_mov_b32_e32 v4, s35
|
|
; GFX8-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v3, v[3:4]
|
|
; GFX8-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX8-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v5, s5
|
|
; GFX8-NEXT: .LBB64_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_or_b32_e32 v1, s7, v3
|
|
; GFX8-NEXT: v_or_b32_e32 v0, s6, v2
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, v0
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB64_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_or_i64_noret_scalar:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
|
; GFX9-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX9-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX9-NEXT: v_mov_b32_e32 v5, s5
|
|
; GFX9-NEXT: .LBB64_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_or_b32_e32 v1, s7, v3
|
|
; GFX9-NEXT: v_or_b32_e32 v0, s6, v2
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX9-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, v0
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB64_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret void
|
|
}
|
|
|
|
; amdgpu_gfx variant with both arguments marked inreg: i64 seq_cst
; `atomicrmw or` at %out + 32 bytes (gep of 4 x i64), result (%tmp0) unused.
; In the assertions below GFX7 and GFX8 form the addresses + 32 and + 36 with
; scalar s_add_u32/s_addc_u32 before copying them into VGPRs, whereas GFX9
; copies the base pointer from s[4:5] and folds the offset into the load and
; the cmpswap (offset:32). The operand %in is used from s6/s7 and the loop's
; accumulated exit mask is kept in s[34:35].
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
|
|
; GFX7-LABEL: flat_atomic_or_i64_noret_offset_scalar:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s34, s4, 32
|
|
; GFX7-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX7-NEXT: s_add_u32 s36, s4, 36
|
|
; GFX7-NEXT: s_addc_u32 s37, s5, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s36
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s37
|
|
; GFX7-NEXT: v_mov_b32_e32 v4, s34
|
|
; GFX7-NEXT: v_mov_b32_e32 v5, s35
|
|
; GFX7-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v2, v[4:5]
|
|
; GFX7-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX7-NEXT: .LBB65_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_or_b32_e32 v1, s7, v3
|
|
; GFX7-NEXT: v_or_b32_e32 v0, s6, v2
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, v0
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB65_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_or_i64_noret_offset_scalar:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: s_add_u32 s34, s4, 32
|
|
; GFX8-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX8-NEXT: s_add_u32 s36, s4, 36
|
|
; GFX8-NEXT: s_addc_u32 s37, s5, 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s36
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s37
|
|
; GFX8-NEXT: v_mov_b32_e32 v4, s34
|
|
; GFX8-NEXT: v_mov_b32_e32 v5, s35
|
|
; GFX8-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v2, v[4:5]
|
|
; GFX8-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX8-NEXT: .LBB65_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_or_b32_e32 v1, s7, v3
|
|
; GFX8-NEXT: v_or_b32_e32 v0, s6, v2
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, v0
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB65_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_or_i64_noret_offset_scalar:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
|
|
; GFX9-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX9-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX9-NEXT: v_mov_b32_e32 v5, s5
|
|
; GFX9-NEXT: .LBB65_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_or_b32_e32 v1, s7, v3
|
|
; GFX9-NEXT: v_or_b32_e32 v0, s6, v2
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX9-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, v0
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB65_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret void
|
|
}
|
|
|
|
; amdgpu_gfx variant with both arguments marked inreg: returning i64 seq_cst
; `atomicrmw or` applied directly to %ptr.
; In the assertions below the pointer is read from s[4:5] and kept in v[2:3]
; for the loop, the operand %in is used from s6/s7, and the loop's accumulated
; exit mask is kept in s[34:35]. The flat_atomic_cmpswap_x2 writes its result
; straight into v[0:1], so no copies follow the loop on any of the run lines.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
|
|
; GFX7-LABEL: flat_atomic_or_i64_ret_scalar:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: s_add_u32 s34, s4, 4
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s34
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s35
|
|
; GFX7-NEXT: flat_load_dword v0, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v1, v[2:3]
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX7-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s5
|
|
; GFX7-NEXT: .LBB66_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX7-NEXT: v_or_b32_e32 v5, s7, v7
|
|
; GFX7-NEXT: v_or_b32_e32 v4, s6, v6
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB66_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_or_i64_ret_scalar:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX8-NEXT: s_add_u32 s34, s4, 4
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX8-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, s34
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, s35
|
|
; GFX8-NEXT: flat_load_dword v0, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v1, v[2:3]
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX8-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, s5
|
|
; GFX8-NEXT: .LBB66_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX8-NEXT: v_or_b32_e32 v5, s7, v7
|
|
; GFX8-NEXT: v_or_b32_e32 v4, s6, v6
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB66_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_or_i64_ret_scalar:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX9-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX9-NEXT: v_mov_b32_e32 v3, s5
|
|
; GFX9-NEXT: .LBB66_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX9-NEXT: v_or_b32_e32 v5, s7, v7
|
|
; GFX9-NEXT: v_or_b32_e32 v4, s6, v6
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB66_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret i64 %result
|
|
}
|
|
|
|
; Value-returning seq_cst `atomicrmw or` of an i64 through a flat pointer that
; is passed `inreg` under the amdgpu_gfx calling convention, addressed
; 4 x i64 (32 bytes) past %out.
; On all three run lines the checks expect a compare-and-swap retry loop
; (flat_atomic_cmpswap_x2) rather than a single flat OR atomic. The GFX7 and
; GFX8 checks build the +32 and +36 addresses with scalar adds and seed the
; loop with two dword loads; the GFX9 checks fold the displacement into the
; instructions as offset:32.
; NOTE(review): metadata node !1 is defined outside this excerpt.
define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_or_i64_ret_offset_scalar:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_add_u32 s34, s4, 32
; GFX7-NEXT:    s_addc_u32 s35, s5, 0
; GFX7-NEXT:    s_add_u32 s36, s4, 36
; GFX7-NEXT:    s_addc_u32 s37, s5, 0
; GFX7-NEXT:    v_mov_b32_e32 v0, s36
; GFX7-NEXT:    v_mov_b32_e32 v1, s37
; GFX7-NEXT:    v_mov_b32_e32 v2, s34
; GFX7-NEXT:    v_mov_b32_e32 v3, s35
; GFX7-NEXT:    flat_load_dword v1, v[0:1]
; GFX7-NEXT:    flat_load_dword v0, v[2:3]
; GFX7-NEXT:    s_mov_b64 s[34:35], 0
; GFX7-NEXT:  .LBB67_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v7, v1
; GFX7-NEXT:    v_mov_b32_e32 v6, v0
; GFX7-NEXT:    v_or_b32_e32 v5, s7, v7
; GFX7-NEXT:    v_or_b32_e32 v4, s6, v6
; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT:    s_cbranch_execnz .LBB67_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_or_i64_ret_offset_scalar:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    s_add_u32 s34, s4, 32
; GFX8-NEXT:    s_addc_u32 s35, s5, 0
; GFX8-NEXT:    s_add_u32 s36, s4, 36
; GFX8-NEXT:    s_addc_u32 s37, s5, 0
; GFX8-NEXT:    v_mov_b32_e32 v0, s36
; GFX8-NEXT:    v_mov_b32_e32 v1, s37
; GFX8-NEXT:    v_mov_b32_e32 v2, s34
; GFX8-NEXT:    v_mov_b32_e32 v3, s35
; GFX8-NEXT:    flat_load_dword v1, v[0:1]
; GFX8-NEXT:    flat_load_dword v0, v[2:3]
; GFX8-NEXT:    s_mov_b64 s[34:35], 0
; GFX8-NEXT:  .LBB67_1: ; %atomicrmw.start
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v7, v1
; GFX8-NEXT:    v_mov_b32_e32 v6, v0
; GFX8-NEXT:    v_or_b32_e32 v5, s7, v7
; GFX8-NEXT:    v_or_b32_e32 v4, s6, v6
; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT:    s_cbranch_execnz .LBB67_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_or_i64_ret_offset_scalar:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NEXT:    flat_load_dwordx2 v[0:1], v[0:1] offset:32
; GFX9-NEXT:    v_mov_b32_e32 v2, s4
; GFX9-NEXT:    s_mov_b64 s[34:35], 0
; GFX9-NEXT:    v_mov_b32_e32 v3, s5
; GFX9-NEXT:  .LBB67_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v7, v1
; GFX9-NEXT:    v_mov_b32_e32 v6, v0
; GFX9-NEXT:    v_or_b32_e32 v5, s7, v7
; GFX9-NEXT:    v_or_b32_e32 v4, s6, v6
; GFX9-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT:    s_cbranch_execnz .LBB67_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; seq_cst `atomicrmw or` of an i64 whose result is unused, through a flat
; pointer 4 x i64 (32 bytes) past %out, with !amdgpu.no.remote.memory
; attached to the atomic.
; The checks recorded here still expect the compare-and-swap retry loop
; (flat_atomic_cmpswap_x2) on all three run lines even with that metadata
; present. The value returned by the cmpswap is copied back into the
; "expected" register pair for the next iteration.
; NOTE(review): metadata nodes !0 and !1 are defined outside this excerpt.
define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 32, v0
; GFX7-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT:    flat_load_dword v7, v[0:1]
; GFX7-NEXT:    flat_load_dword v6, v[8:9]
; GFX7-NEXT:    s_mov_b64 s[4:5], 0
; GFX7-NEXT:  .LBB68_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_or_b32_e32 v5, v7, v3
; GFX7-NEXT:    v_or_b32_e32 v4, v6, v2
; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT:    v_mov_b32_e32 v7, v1
; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT:    v_mov_b32_e32 v6, v0
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_cbranch_execnz .LBB68_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 32, v0
; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v7, v[0:1]
; GFX8-NEXT:    flat_load_dword v6, v[8:9]
; GFX8-NEXT:    s_mov_b64 s[4:5], 0
; GFX8-NEXT:  .LBB68_1: ; %atomicrmw.start
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_or_b32_e32 v5, v7, v3
; GFX8-NEXT:    v_or_b32_e32 v4, v6, v2
; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT:    v_mov_b32_e32 v7, v1
; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT:    v_mov_b32_e32 v6, v0
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_cbranch_execnz .LBB68_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_load_dwordx2 v[6:7], v[0:1] offset:32
; GFX9-NEXT:    s_mov_b64 s[4:5], 0
; GFX9-NEXT:  .LBB68_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_or_b32_e32 v5, v7, v3
; GFX9-NEXT:    v_or_b32_e32 v4, v6, v2
; GFX9-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT:    v_mov_b32_e32 v7, v5
; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT:    v_mov_b32_e32 v6, v4
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT:    s_cbranch_execnz .LBB68_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Value-returning seq_cst `atomicrmw or` of an i64 through a flat pointer
; 4 x i64 (32 bytes) past %out, with !amdgpu.no.remote.memory attached to
; the atomic.
; The checks recorded here still expect the compare-and-swap retry loop
; (flat_atomic_cmpswap_x2) on all three run lines even with that metadata
; present. In the GFX9 checks the loop result lives in v[4:5] and is copied
; to v[0:1] after the loop, just before the return.
; NOTE(review): metadata nodes !0 and !1 are defined outside this excerpt.
define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 32, v0
; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT:    flat_load_dword v1, v[0:1]
; GFX7-NEXT:    flat_load_dword v0, v[4:5]
; GFX7-NEXT:    s_mov_b64 s[4:5], 0
; GFX7-NEXT:  .LBB69_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v9, v1
; GFX7-NEXT:    v_mov_b32_e32 v8, v0
; GFX7-NEXT:    v_or_b32_e32 v7, v9, v3
; GFX7-NEXT:    v_or_b32_e32 v6, v8, v2
; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_cbranch_execnz .LBB69_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v1, v[0:1]
; GFX8-NEXT:    flat_load_dword v0, v[4:5]
; GFX8-NEXT:    s_mov_b64 s[4:5], 0
; GFX8-NEXT:  .LBB69_1: ; %atomicrmw.start
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v9, v1
; GFX8-NEXT:    v_mov_b32_e32 v8, v0
; GFX8-NEXT:    v_or_b32_e32 v7, v9, v3
; GFX8-NEXT:    v_or_b32_e32 v6, v8, v2
; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_cbranch_execnz .LBB69_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_load_dwordx2 v[4:5], v[0:1] offset:32
; GFX9-NEXT:    s_mov_b64 s[4:5], 0
; GFX9-NEXT:  .LBB69_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v7, v5
; GFX9-NEXT:    v_mov_b32_e32 v6, v4
; GFX9-NEXT:    v_or_b32_e32 v5, v7, v3
; GFX9-NEXT:    v_or_b32_e32 v4, v6, v2
; GFX9-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT:    s_cbranch_execnz .LBB69_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    v_mov_b32_e32 v1, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; ---------------------------------------------------------------------
; atomicrmw xor
; ---------------------------------------------------------------------
|
|
|
|
; seq_cst `atomicrmw xor` of an i64 whose result is unused, directly on the
; flat pointer argument (no displacement).
; On all three run lines the checks expect a compare-and-swap retry loop
; (flat_atomic_cmpswap_x2) rather than a single flat XOR atomic. The GFX7 and
; GFX8 checks read the initial value as two dword loads (base and base+4);
; the GFX9 checks use one flat_load_dwordx2.
; NOTE(review): metadata node !1 is defined outside this excerpt.
define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_xor_i64_noret:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 4, v0
; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT:    flat_load_dword v6, v[0:1]
; GFX7-NEXT:    flat_load_dword v7, v[4:5]
; GFX7-NEXT:    s_mov_b64 s[4:5], 0
; GFX7-NEXT:  .LBB70_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_xor_b32_e32 v5, v7, v3
; GFX7-NEXT:    v_xor_b32_e32 v4, v6, v2
; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX7-NEXT:    v_mov_b32_e32 v7, v5
; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT:    v_mov_b32_e32 v6, v4
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_cbranch_execnz .LBB70_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xor_i64_noret:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 4, v0
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v6, v[0:1]
; GFX8-NEXT:    flat_load_dword v7, v[4:5]
; GFX8-NEXT:    s_mov_b64 s[4:5], 0
; GFX8-NEXT:  .LBB70_1: ; %atomicrmw.start
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_xor_b32_e32 v5, v7, v3
; GFX8-NEXT:    v_xor_b32_e32 v4, v6, v2
; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX8-NEXT:    v_mov_b32_e32 v7, v5
; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT:    v_mov_b32_e32 v6, v4
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_cbranch_execnz .LBB70_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xor_i64_noret:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_load_dwordx2 v[6:7], v[0:1]
; GFX9-NEXT:    s_mov_b64 s[4:5], 0
; GFX9-NEXT:  .LBB70_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_xor_b32_e32 v5, v7, v3
; GFX9-NEXT:    v_xor_b32_e32 v4, v6, v2
; GFX9-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT:    v_mov_b32_e32 v7, v5
; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT:    v_mov_b32_e32 v6, v4
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT:    s_cbranch_execnz .LBB70_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; seq_cst `atomicrmw xor` of an i64 whose result is unused, through a flat
; pointer 4 x i64 (32 bytes) past %out.
; On all three run lines the checks expect a compare-and-swap retry loop
; (flat_atomic_cmpswap_x2). The GFX7 and GFX8 checks compute the +32 and +36
; addresses with vector adds before the loop; the GFX9 checks encode the
; displacement as offset:32 on both the initial load and the cmpswap.
; NOTE(review): metadata node !1 is defined outside this excerpt.
define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_xor_i64_noret_offset:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_add_i32_e32 v8, vcc, 32, v0
; GFX7-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT:    flat_load_dword v7, v[0:1]
; GFX7-NEXT:    flat_load_dword v6, v[8:9]
; GFX7-NEXT:    s_mov_b64 s[4:5], 0
; GFX7-NEXT:  .LBB71_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_xor_b32_e32 v5, v7, v3
; GFX7-NEXT:    v_xor_b32_e32 v4, v6, v2
; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT:    v_mov_b32_e32 v7, v1
; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT:    v_mov_b32_e32 v6, v0
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_cbranch_execnz .LBB71_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xor_i64_noret_offset:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 32, v0
; GFX8-NEXT:    v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v7, v[0:1]
; GFX8-NEXT:    flat_load_dword v6, v[8:9]
; GFX8-NEXT:    s_mov_b64 s[4:5], 0
; GFX8-NEXT:  .LBB71_1: ; %atomicrmw.start
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_xor_b32_e32 v5, v7, v3
; GFX8-NEXT:    v_xor_b32_e32 v4, v6, v2
; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT:    v_mov_b32_e32 v7, v1
; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT:    v_mov_b32_e32 v6, v0
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_cbranch_execnz .LBB71_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xor_i64_noret_offset:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_load_dwordx2 v[6:7], v[0:1] offset:32
; GFX9-NEXT:    s_mov_b64 s[4:5], 0
; GFX9-NEXT:  .LBB71_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_xor_b32_e32 v5, v7, v3
; GFX9-NEXT:    v_xor_b32_e32 v4, v6, v2
; GFX9-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT:    v_mov_b32_e32 v7, v5
; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT:    v_mov_b32_e32 v6, v4
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT:    s_cbranch_execnz .LBB71_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Value-returning seq_cst `atomicrmw xor` of an i64 directly on the flat
; pointer argument (no displacement).
; On all three run lines the checks expect a compare-and-swap retry loop
; (flat_atomic_cmpswap_x2). The loop keeps its result in v[4:5]; after the
; loop the checks copy it into v[0:1] before the return.
; NOTE(review): metadata node !1 is defined outside this excerpt.
define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_xor_i64_ret:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_add_i32_e32 v5, vcc, 4, v0
; GFX7-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
; GFX7-NEXT:    flat_load_dword v4, v[0:1]
; GFX7-NEXT:    flat_load_dword v5, v[5:6]
; GFX7-NEXT:    s_mov_b64 s[4:5], 0
; GFX7-NEXT:  .LBB72_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v7, v5
; GFX7-NEXT:    v_mov_b32_e32 v6, v4
; GFX7-NEXT:    v_xor_b32_e32 v5, v7, v3
; GFX7-NEXT:    v_xor_b32_e32 v4, v6, v2
; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_cbranch_execnz .LBB72_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT:    v_mov_b32_e32 v0, v4
; GFX7-NEXT:    v_mov_b32_e32 v1, v5
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xor_i64_ret:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 4, v0
; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v4, v[0:1]
; GFX8-NEXT:    flat_load_dword v5, v[5:6]
; GFX8-NEXT:    s_mov_b64 s[4:5], 0
; GFX8-NEXT:  .LBB72_1: ; %atomicrmw.start
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v7, v5
; GFX8-NEXT:    v_mov_b32_e32 v6, v4
; GFX8-NEXT:    v_xor_b32_e32 v5, v7, v3
; GFX8-NEXT:    v_xor_b32_e32 v4, v6, v2
; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_cbranch_execnz .LBB72_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    v_mov_b32_e32 v0, v4
; GFX8-NEXT:    v_mov_b32_e32 v1, v5
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xor_i64_ret:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
; GFX9-NEXT:    s_mov_b64 s[4:5], 0
; GFX9-NEXT:  .LBB72_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v7, v5
; GFX9-NEXT:    v_mov_b32_e32 v6, v4
; GFX9-NEXT:    v_xor_b32_e32 v5, v7, v3
; GFX9-NEXT:    v_xor_b32_e32 v4, v6, v2
; GFX9-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT:    s_cbranch_execnz .LBB72_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    v_mov_b32_e32 v1, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %result = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; Value-returning seq_cst `atomicrmw xor` of an i64 through a flat pointer
; 4 x i64 (32 bytes) past %out.
; On all three run lines the checks expect a compare-and-swap retry loop
; (flat_atomic_cmpswap_x2). In the GFX7 and GFX8 checks the loop result
; already sits in v[0:1]; in the GFX9 checks it is in v[4:5] and is copied
; to v[0:1] after the loop.
; NOTE(review): metadata node !1 is defined outside this excerpt.
define i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_xor_i64_ret_offset:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 32, v0
; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT:    flat_load_dword v1, v[0:1]
; GFX7-NEXT:    flat_load_dword v0, v[4:5]
; GFX7-NEXT:    s_mov_b64 s[4:5], 0
; GFX7-NEXT:  .LBB73_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v9, v1
; GFX7-NEXT:    v_mov_b32_e32 v8, v0
; GFX7-NEXT:    v_xor_b32_e32 v7, v9, v3
; GFX7-NEXT:    v_xor_b32_e32 v6, v8, v2
; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_cbranch_execnz .LBB73_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xor_i64_ret_offset:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 32, v0
; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT:    flat_load_dword v1, v[0:1]
; GFX8-NEXT:    flat_load_dword v0, v[4:5]
; GFX8-NEXT:    s_mov_b64 s[4:5], 0
; GFX8-NEXT:  .LBB73_1: ; %atomicrmw.start
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v9, v1
; GFX8-NEXT:    v_mov_b32_e32 v8, v0
; GFX8-NEXT:    v_xor_b32_e32 v7, v9, v3
; GFX8-NEXT:    v_xor_b32_e32 v6, v8, v2
; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_cbranch_execnz .LBB73_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xor_i64_ret_offset:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    flat_load_dwordx2 v[4:5], v[0:1] offset:32
; GFX9-NEXT:    s_mov_b64 s[4:5], 0
; GFX9-NEXT:  .LBB73_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v7, v5
; GFX9-NEXT:    v_mov_b32_e32 v6, v4
; GFX9-NEXT:    v_xor_b32_e32 v5, v7, v3
; GFX9-NEXT:    v_xor_b32_e32 v4, v6, v2
; GFX9-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT:    s_cbranch_execnz .LBB73_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT:    v_mov_b32_e32 v0, v4
; GFX9-NEXT:    v_mov_b32_e32 v1, v5
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; seq_cst `atomicrmw xor` of an i64 whose result is unused, on a flat pointer
; passed `inreg` under the amdgpu_gfx calling convention (no displacement).
; On all three run lines the checks expect a compare-and-swap retry loop
; (flat_atomic_cmpswap_x2). The pointer arrives in s[4:5] and the operand in
; s[6:7]; the checks copy the pointer into VGPRs for the loads and the
; cmpswap and use s[34:35] as the loop's exit-mask accumulator.
; NOTE(review): metadata node !1 is defined outside this excerpt.
define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_xor_i64_noret_scalar:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_mov_b32_e32 v0, s4
; GFX7-NEXT:    s_add_u32 s34, s4, 4
; GFX7-NEXT:    v_mov_b32_e32 v1, s5
; GFX7-NEXT:    s_addc_u32 s35, s5, 0
; GFX7-NEXT:    v_mov_b32_e32 v3, s34
; GFX7-NEXT:    v_mov_b32_e32 v4, s35
; GFX7-NEXT:    flat_load_dword v2, v[0:1]
; GFX7-NEXT:    flat_load_dword v3, v[3:4]
; GFX7-NEXT:    v_mov_b32_e32 v4, s4
; GFX7-NEXT:    s_mov_b64 s[34:35], 0
; GFX7-NEXT:    v_mov_b32_e32 v5, s5
; GFX7-NEXT:  .LBB74_1: ; %atomicrmw.start
; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    v_xor_b32_e32 v1, s7, v3
; GFX7-NEXT:    v_xor_b32_e32 v0, s6, v2
; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT:    buffer_wbinvl1_vol
; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT:    v_mov_b32_e32 v3, v1
; GFX7-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT:    v_mov_b32_e32 v2, v0
; GFX7-NEXT:    s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT:    s_cbranch_execnz .LBB74_1
; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT:    s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_xor_i64_noret_scalar:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v0, s4
; GFX8-NEXT:    s_add_u32 s34, s4, 4
; GFX8-NEXT:    v_mov_b32_e32 v1, s5
; GFX8-NEXT:    s_addc_u32 s35, s5, 0
; GFX8-NEXT:    v_mov_b32_e32 v3, s34
; GFX8-NEXT:    v_mov_b32_e32 v4, s35
; GFX8-NEXT:    flat_load_dword v2, v[0:1]
; GFX8-NEXT:    flat_load_dword v3, v[3:4]
; GFX8-NEXT:    v_mov_b32_e32 v4, s4
; GFX8-NEXT:    s_mov_b64 s[34:35], 0
; GFX8-NEXT:    v_mov_b32_e32 v5, s5
; GFX8-NEXT:  .LBB74_1: ; %atomicrmw.start
; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_xor_b32_e32 v1, s7, v3
; GFX8-NEXT:    v_xor_b32_e32 v0, s6, v2
; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT:    v_mov_b32_e32 v3, v1
; GFX8-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT:    v_mov_b32_e32 v2, v0
; GFX8-NEXT:    s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT:    s_cbranch_execnz .LBB74_1
; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT:    s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_xor_i64_noret_scalar:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, s4
; GFX9-NEXT:    v_mov_b32_e32 v1, s5
; GFX9-NEXT:    flat_load_dwordx2 v[2:3], v[0:1]
; GFX9-NEXT:    v_mov_b32_e32 v4, s4
; GFX9-NEXT:    s_mov_b64 s[34:35], 0
; GFX9-NEXT:    v_mov_b32_e32 v5, s5
; GFX9-NEXT:  .LBB74_1: ; %atomicrmw.start
; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_xor_b32_e32 v1, s7, v3
; GFX9-NEXT:    v_xor_b32_e32 v0, s6, v2
; GFX9-NEXT:    flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT:    v_mov_b32_e32 v3, v1
; GFX9-NEXT:    s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT:    v_mov_b32_e32 v2, v0
; GFX9-NEXT:    s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT:    s_cbranch_execnz .LBB74_1
; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT:    s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Test: 64-bit flat `atomicrmw xor` (seq_cst) on %out + 4 x i64 (32 bytes), with
; the pointer and the operand passed `inreg` under the amdgpu_gfx calling
; convention and the atomic result left unused.
; The autogenerated checks expect a compare-and-swap retry loop
; (%atomicrmw.start .. %atomicrmw.end around flat_atomic_cmpswap_x2) instead of
; a single xor atomic. In the GFX7 and GFX8 checks the +32 and +36 addresses are
; formed with s_add_u32/s_addc_u32 and the initial value is read with two dword
; loads; in the GFX9 checks one dwordx2 load is used and offset:32 is carried on
; both the load and the cmpswap.
; NOTE(review): the !1 metadata node is defined outside this excerpt.
; Do not hand-edit the check lines below; regenerate them with
; utils/update_llc_test_checks.py.
define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
|
|
; GFX7-LABEL: flat_atomic_xor_i64_noret_offset_scalar:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s34, s4, 32
|
|
; GFX7-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX7-NEXT: s_add_u32 s36, s4, 36
|
|
; GFX7-NEXT: s_addc_u32 s37, s5, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s36
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s37
|
|
; GFX7-NEXT: v_mov_b32_e32 v4, s34
|
|
; GFX7-NEXT: v_mov_b32_e32 v5, s35
|
|
; GFX7-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v2, v[4:5]
|
|
; GFX7-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_xor_b32_e32 v1, s7, v3
|
|
; GFX7-NEXT: v_xor_b32_e32 v0, s6, v2
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, v0
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB75_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_xor_i64_noret_offset_scalar:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: s_add_u32 s34, s4, 32
|
|
; GFX8-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX8-NEXT: s_add_u32 s36, s4, 36
|
|
; GFX8-NEXT: s_addc_u32 s37, s5, 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s36
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s37
|
|
; GFX8-NEXT: v_mov_b32_e32 v4, s34
|
|
; GFX8-NEXT: v_mov_b32_e32 v5, s35
|
|
; GFX8-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v2, v[4:5]
|
|
; GFX8-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_xor_b32_e32 v1, s7, v3
|
|
; GFX8-NEXT: v_xor_b32_e32 v0, s6, v2
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, v0
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB75_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_xor_i64_noret_offset_scalar:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
|
|
; GFX9-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX9-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX9-NEXT: v_mov_b32_e32 v5, s5
|
|
; GFX9-NEXT: .LBB75_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_xor_b32_e32 v1, s7, v3
|
|
; GFX9-NEXT: v_xor_b32_e32 v0, s6, v2
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX9-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, v0
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB75_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret void
|
|
}
|
|
|
|
; Test: 64-bit flat `atomicrmw xor` (seq_cst) directly on %ptr (no offset), with
; the pointer and the operand passed `inreg` under the amdgpu_gfx calling
; convention; the value returned by the atomic is the function result.
; The autogenerated checks expect a compare-and-swap retry loop
; (%atomicrmw.start .. %atomicrmw.end around flat_atomic_cmpswap_x2). Each
; iteration copies the last observed value from v[0:1] into v[6:7] as the
; compare operand, and the cmpswap writes its result back to v[0:1], so no extra
; move is checked before s_setpc_b64. In the GFX7 and GFX8 checks the initial
; value is read with two dword loads (the second at +4); the GFX9 checks use a
; single dwordx2 load.
; NOTE(review): the !1 metadata node is defined outside this excerpt.
; Do not hand-edit the check lines below; regenerate them with
; utils/update_llc_test_checks.py.
define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
|
|
; GFX7-LABEL: flat_atomic_xor_i64_ret_scalar:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: s_add_u32 s34, s4, 4
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s34
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s35
|
|
; GFX7-NEXT: flat_load_dword v0, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v1, v[2:3]
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX7-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s5
|
|
; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX7-NEXT: v_xor_b32_e32 v5, s7, v7
|
|
; GFX7-NEXT: v_xor_b32_e32 v4, s6, v6
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB76_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_xor_i64_ret_scalar:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX8-NEXT: s_add_u32 s34, s4, 4
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX8-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, s34
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, s35
|
|
; GFX8-NEXT: flat_load_dword v0, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v1, v[2:3]
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX8-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, s5
|
|
; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX8-NEXT: v_xor_b32_e32 v5, s7, v7
|
|
; GFX8-NEXT: v_xor_b32_e32 v4, s6, v6
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB76_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_xor_i64_ret_scalar:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX9-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX9-NEXT: v_mov_b32_e32 v3, s5
|
|
; GFX9-NEXT: .LBB76_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX9-NEXT: v_xor_b32_e32 v5, s7, v7
|
|
; GFX9-NEXT: v_xor_b32_e32 v4, s6, v6
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB76_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret i64 %result
|
|
}
|
|
|
|
; Test: 64-bit flat `atomicrmw xor` (seq_cst) on %out + 4 x i64 (32 bytes), with
; the pointer and the operand passed `inreg` under the amdgpu_gfx calling
; convention; the value returned by the atomic is the function result.
; The autogenerated checks expect a compare-and-swap retry loop
; (%atomicrmw.start .. %atomicrmw.end around flat_atomic_cmpswap_x2) whose
; result lands in v[0:1]. In the GFX7 and GFX8 checks the +32 and +36 addresses
; are formed with s_add_u32/s_addc_u32 and the initial value is read with two
; dword loads; in the GFX9 checks one dwordx2 load is used and offset:32 is
; carried on both the load and the cmpswap.
; NOTE(review): the !1 metadata node is defined outside this excerpt.
; Do not hand-edit the check lines below; regenerate them with
; utils/update_llc_test_checks.py.
define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
|
|
; GFX7-LABEL: flat_atomic_xor_i64_ret_offset_scalar:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s34, s4, 32
|
|
; GFX7-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX7-NEXT: s_add_u32 s36, s4, 36
|
|
; GFX7-NEXT: s_addc_u32 s37, s5, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s36
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s37
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s34
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s35
|
|
; GFX7-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v0, v[2:3]
|
|
; GFX7-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX7-NEXT: .LBB77_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX7-NEXT: v_xor_b32_e32 v5, s7, v7
|
|
; GFX7-NEXT: v_xor_b32_e32 v4, s6, v6
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB77_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_xor_i64_ret_offset_scalar:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: s_add_u32 s34, s4, 32
|
|
; GFX8-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX8-NEXT: s_add_u32 s36, s4, 36
|
|
; GFX8-NEXT: s_addc_u32 s37, s5, 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s36
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s37
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, s34
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, s35
|
|
; GFX8-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v0, v[2:3]
|
|
; GFX8-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX8-NEXT: v_xor_b32_e32 v5, s7, v7
|
|
; GFX8-NEXT: v_xor_b32_e32 v4, s6, v6
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB77_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_xor_i64_ret_offset_scalar:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX9-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX9-NEXT: v_mov_b32_e32 v3, s5
|
|
; GFX9-NEXT: .LBB77_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX9-NEXT: v_xor_b32_e32 v5, s7, v7
|
|
; GFX9-NEXT: v_xor_b32_e32 v4, s6, v6
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB77_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret i64 %result
|
|
}
|
|
|
|
; Test: 64-bit flat `atomicrmw xor` (seq_cst) on %out + 4 x i64 (32 bytes) that
; additionally carries !amdgpu.no.remote.memory; the atomic result is unused.
; As currently recorded, the autogenerated checks still expect a
; compare-and-swap retry loop (%atomicrmw.start .. %atomicrmw.end around
; flat_atomic_cmpswap_x2) even with that metadata attached. In the GFX7 and GFX8
; checks the +32 and +36 addresses are formed with vector adds and the initial
; value is read with two dword loads; in the GFX9 checks one dwordx2 load is
; used and offset:32 is carried on both the load and the cmpswap.
; NOTE(review): the !0 and !1 metadata nodes are defined outside this excerpt.
; Do not hand-edit the check lines below; regenerate them with
; utils/update_llc_test_checks.py.
define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v7, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v6, v[8:9]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB78_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_xor_b32_e32 v5, v7, v3
|
|
; GFX7-NEXT: v_xor_b32_e32 v4, v6, v2
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB78_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v7, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v6, v[8:9]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_xor_b32_e32 v5, v7, v3
|
|
; GFX8-NEXT: v_xor_b32_e32 v4, v6, v2
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB78_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB78_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3
|
|
; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB78_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
|
|
ret void
|
|
}
|
|
|
|
; Test: 64-bit flat `atomicrmw xor` (seq_cst) on %out + 4 x i64 (32 bytes) that
; additionally carries !amdgpu.no.remote.memory; the value returned by the
; atomic is the function result.
; As currently recorded, the autogenerated checks still expect a
; compare-and-swap retry loop (%atomicrmw.start .. %atomicrmw.end around
; flat_atomic_cmpswap_x2) even with that metadata attached. In the GFX7 and GFX8
; checks the cmpswap result is already in v[0:1]; in the GFX9 checks it is
; produced in v[4:5] and moved to v0/v1 after the loop.
; NOTE(review): the !0 and !1 metadata nodes are defined outside this excerpt.
; Do not hand-edit the check lines below; regenerate them with
; utils/update_llc_test_checks.py.
define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v0, v[4:5]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB79_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX7-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX7-NEXT: v_xor_b32_e32 v7, v9, v3
|
|
; GFX7-NEXT: v_xor_b32_e32 v6, v8, v2
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB79_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v0, v[4:5]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX8-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX8-NEXT: v_xor_b32_e32 v7, v9, v3
|
|
; GFX8-NEXT: v_xor_b32_e32 v6, v8, v2
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB79_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB79_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: v_xor_b32_e32 v5, v7, v3
|
|
; GFX9-NEXT: v_xor_b32_e32 v4, v6, v2
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB79_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
|
|
ret i64 %result
|
|
}
|
|
|
|
; ---------------------------------------------------------------------
|
|
; atomicrmw max
|
|
; ---------------------------------------------------------------------
|
|
|
|
; Test: 64-bit flat signed `atomicrmw max` (seq_cst) directly on %ptr, with the
; atomic result left unused.
; The autogenerated checks expect a compare-and-swap retry loop
; (%atomicrmw.start .. %atomicrmw.end around flat_atomic_cmpswap_x2). Inside the
; loop the new value is chosen with v_cmp_gt_i64 plus a pair of v_cndmask_b32
; (keep the loaded value when it is greater than %in, otherwise use %in). In the
; GFX7 and GFX8 checks the initial value is read with two dword loads (the
; second at +4); the GFX9 checks use a single dwordx2 load.
; NOTE(review): the !1 metadata node is defined outside this excerpt.
; Do not hand-edit the check lines below; regenerate them with
; utils/update_llc_test_checks.py.
define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_max_i64_noret:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v6, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v7, v[4:5]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB80_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB80_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_max_i64_noret:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v6, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v7, v[4:5]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB80_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB80_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_max_i64_noret:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB80_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB80_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret void
|
|
}
|
|
|
|
; Test: 64-bit flat signed `atomicrmw max` (seq_cst) on %out + 4 x i64
; (32 bytes), with the atomic result left unused.
; The autogenerated checks expect a compare-and-swap retry loop
; (%atomicrmw.start .. %atomicrmw.end around flat_atomic_cmpswap_x2), with the
; new value chosen by v_cmp_gt_i64 plus a pair of v_cndmask_b32. In the GFX7 and
; GFX8 checks the +32 and +36 addresses are formed with vector adds and the
; initial value is read with two dword loads; in the GFX9 checks one dwordx2
; load is used and offset:32 is carried on both the load and the cmpswap.
; NOTE(review): the !1 metadata node is defined outside this excerpt.
; Do not hand-edit the check lines below; regenerate them with
; utils/update_llc_test_checks.py.
define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_max_i64_noret_offset:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v7, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v6, v[8:9]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB81_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB81_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_max_i64_noret_offset:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v7, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v6, v[8:9]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB81_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB81_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_max_i64_noret_offset:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB81_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB81_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret void
|
|
}
|
|
|
|
; Test: 64-bit flat signed `atomicrmw max` (seq_cst) directly on %ptr; the value
; returned by the atomic is the function result.
; The autogenerated checks expect a compare-and-swap retry loop
; (%atomicrmw.start .. %atomicrmw.end around flat_atomic_cmpswap_x2), with the
; new value chosen by v_cmp_gt_i64 plus a pair of v_cndmask_b32. For all three
; targets the cmpswap result is produced in v[4:5] and moved to v0/v1 after the
; loop. In the GFX7 and GFX8 checks the initial value is read with two dword
; loads (the second at +4); the GFX9 checks use a single dwordx2 load.
; NOTE(review): the !1 metadata node is defined outside this excerpt.
; Do not hand-edit the check lines below; regenerate them with
; utils/update_llc_test_checks.py.
define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_max_i64_ret:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v4, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v5, v[5:6]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB82_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB82_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_max_i64_ret:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v4, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v5, v[5:6]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB82_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB82_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_max_i64_ret:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB82_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB82_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret i64 %result
|
|
}
|
|
|
|
; Value-returning i64 `atomicrmw max` (seq_cst, no syncscope) on a flat
; pointer, addressed at element 4 of an i64 array (byte offset 32).
; The checks expect a compare-and-swap retry loop (%atomicrmw.start) built
; around flat_atomic_cmpswap_x2 rather than a single max atomic.
; The GFX7 and GFX8 checks add the offset explicitly and load the initial
; value as two dwords (offsets 32 and 36); the GFX9 checks use the
; instruction's offset:32 field with one dwordx2 load and copy the final
; value into v[0:1] after the loop.
; NOTE(review): !1 is defined outside this excerpt - confirm which address
; spaces the !noalias.addrspace metadata excludes.
define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_max_i64_ret_offset:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v0, v[4:5]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB83_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX7-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB83_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_max_i64_ret_offset:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v0, v[4:5]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB83_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX8-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB83_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_max_i64_ret_offset:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB83_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB83_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret i64 %result
|
|
}
|
|
|
|
; i64 `atomicrmw max` (seq_cst, no syncscope) whose result is unused, in an
; amdgpu_gfx function where both the flat pointer and the operand are
; `inreg`; the checks read them from s[4:5] and s[6:7].
; The checks expect a compare-and-swap retry loop (%atomicrmw.start) built
; around flat_atomic_cmpswap_x2. The GFX7 and GFX8 checks load the initial
; value as two dwords (the high half from pointer + 4); the GFX9 checks use
; a single dwordx2 load.
; NOTE(review): !1 is defined outside this excerpt - confirm which address
; spaces the !noalias.addrspace metadata excludes.
define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
|
|
; GFX7-LABEL: flat_atomic_max_i64_noret_scalar:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: s_add_u32 s34, s4, 4
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s34
|
|
; GFX7-NEXT: v_mov_b32_e32 v4, s35
|
|
; GFX7-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v3, v[3:4]
|
|
; GFX7-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX7-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, s7
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, s6
|
|
; GFX7-NEXT: v_mov_b32_e32 v5, s5
|
|
; GFX7-NEXT: .LBB84_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, v0
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB84_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_max_i64_noret_scalar:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX8-NEXT: s_add_u32 s34, s4, 4
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX8-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, s34
|
|
; GFX8-NEXT: v_mov_b32_e32 v4, s35
|
|
; GFX8-NEXT: flat_load_dword v2, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v3, v[3:4]
|
|
; GFX8-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX8-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, s7
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, s6
|
|
; GFX8-NEXT: v_mov_b32_e32 v5, s5
|
|
; GFX8-NEXT: .LBB84_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, v0
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB84_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_max_i64_noret_scalar:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
|
; GFX9-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX9-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, s7
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, s6
|
|
; GFX9-NEXT: v_mov_b32_e32 v5, s5
|
|
; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX9-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, v0
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB84_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret void
|
|
}
|
|
|
|
; i64 `atomicrmw max` (seq_cst, no syncscope) whose result is unused, in an
; amdgpu_gfx function with `inreg` pointer and operand, addressed at
; element 4 of an i64 array (byte offset 32).
; The checks expect a compare-and-swap retry loop (%atomicrmw.start) built
; around flat_atomic_cmpswap_x2. The GFX7 and GFX8 checks compute
; pointer + 32 and pointer + 36 with scalar adds and load the initial value
; as two dwords; the GFX9 checks use the instruction's offset:32 field with
; one dwordx2 load.
; NOTE(review): !1 is defined outside this excerpt - confirm which address
; spaces the !noalias.addrspace metadata excludes.
define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
|
|
; GFX7-LABEL: flat_atomic_max_i64_noret_offset_scalar:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s34, s4, 32
|
|
; GFX7-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX7-NEXT: s_add_u32 s36, s4, 36
|
|
; GFX7-NEXT: s_addc_u32 s37, s5, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s36
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s37
|
|
; GFX7-NEXT: v_mov_b32_e32 v4, s34
|
|
; GFX7-NEXT: v_mov_b32_e32 v5, s35
|
|
; GFX7-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v2, v[4:5]
|
|
; GFX7-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, s7
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, s6
|
|
; GFX7-NEXT: .LBB85_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, v0
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB85_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_max_i64_noret_offset_scalar:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: s_add_u32 s34, s4, 32
|
|
; GFX8-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX8-NEXT: s_add_u32 s36, s4, 36
|
|
; GFX8-NEXT: s_addc_u32 s37, s5, 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s36
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s37
|
|
; GFX8-NEXT: v_mov_b32_e32 v4, s34
|
|
; GFX8-NEXT: v_mov_b32_e32 v5, s35
|
|
; GFX8-NEXT: flat_load_dword v3, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v2, v[4:5]
|
|
; GFX8-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, s7
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, s6
|
|
; GFX8-NEXT: .LBB85_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, v0
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB85_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_max_i64_noret_offset_scalar:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
|
|
; GFX9-NEXT: v_mov_b32_e32 v4, s4
|
|
; GFX9-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, s7
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, s6
|
|
; GFX9-NEXT: v_mov_b32_e32 v5, s5
|
|
; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX9-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, v0
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB85_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret void
|
|
}
|
|
|
|
; Value-returning i64 `atomicrmw max` (seq_cst, no syncscope) in an
; amdgpu_gfx function where both the flat pointer and the operand are
; `inreg`; the checks read them from s[4:5] and s[6:7].
; The checks expect a compare-and-swap retry loop (%atomicrmw.start) built
; around flat_atomic_cmpswap_x2, with the value it returns left in v[0:1]
; when the function returns. The GFX7 and GFX8 checks load the initial value
; as two dwords (the high half from pointer + 4); the GFX9 checks use a
; single dwordx2 load.
; NOTE(review): !1 is defined outside this excerpt - confirm which address
; spaces the !noalias.addrspace metadata excludes.
define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
|
|
; GFX7-LABEL: flat_atomic_max_i64_ret_scalar:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: s_add_u32 s34, s4, 4
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s34
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s35
|
|
; GFX7-NEXT: flat_load_dword v0, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v1, v[2:3]
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX7-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v4, s7
|
|
; GFX7-NEXT: v_mov_b32_e32 v5, s6
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s5
|
|
; GFX7-NEXT: .LBB86_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX7-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB86_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_max_i64_ret_scalar:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX8-NEXT: s_add_u32 s34, s4, 4
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX8-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, s34
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, s35
|
|
; GFX8-NEXT: flat_load_dword v0, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v1, v[2:3]
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX8-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v4, s7
|
|
; GFX8-NEXT: v_mov_b32_e32 v5, s6
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, s5
|
|
; GFX8-NEXT: .LBB86_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX8-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB86_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_max_i64_ret_scalar:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX9-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX9-NEXT: v_mov_b32_e32 v4, s7
|
|
; GFX9-NEXT: v_mov_b32_e32 v5, s6
|
|
; GFX9-NEXT: v_mov_b32_e32 v3, s5
|
|
; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX9-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB86_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret i64 %result
|
|
}
|
|
|
|
; Value-returning i64 `atomicrmw max` (seq_cst, no syncscope) in an
; amdgpu_gfx function with `inreg` pointer and operand, addressed at
; element 4 of an i64 array (byte offset 32).
; The checks expect a compare-and-swap retry loop (%atomicrmw.start) built
; around flat_atomic_cmpswap_x2, with the value it returns left in v[0:1]
; when the function returns. The GFX7 and GFX8 checks compute pointer + 32
; and pointer + 36 with scalar adds and load the initial value as two
; dwords; the GFX9 checks use the instruction's offset:32 field with one
; dwordx2 load.
; NOTE(review): !1 is defined outside this excerpt - confirm which address
; spaces the !noalias.addrspace metadata excludes.
define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
|
|
; GFX7-LABEL: flat_atomic_max_i64_ret_offset_scalar:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s34, s4, 32
|
|
; GFX7-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX7-NEXT: s_add_u32 s36, s4, 36
|
|
; GFX7-NEXT: s_addc_u32 s37, s5, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s36
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s37
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s34
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s35
|
|
; GFX7-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v0, v[2:3]
|
|
; GFX7-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v4, s7
|
|
; GFX7-NEXT: v_mov_b32_e32 v5, s6
|
|
; GFX7-NEXT: .LBB87_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX7-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB87_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_max_i64_ret_offset_scalar:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: s_add_u32 s34, s4, 32
|
|
; GFX8-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX8-NEXT: s_add_u32 s36, s4, 36
|
|
; GFX8-NEXT: s_addc_u32 s37, s5, 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s36
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s37
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, s34
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, s35
|
|
; GFX8-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v0, v[2:3]
|
|
; GFX8-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v4, s7
|
|
; GFX8-NEXT: v_mov_b32_e32 v5, s6
|
|
; GFX8-NEXT: .LBB87_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX8-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB87_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_max_i64_ret_offset_scalar:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX9-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX9-NEXT: v_mov_b32_e32 v4, s7
|
|
; GFX9-NEXT: v_mov_b32_e32 v5, s6
|
|
; GFX9-NEXT: v_mov_b32_e32 v3, s5
|
|
; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX9-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB87_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret i64 %result
|
|
}
|
|
|
|
; Kernel test: i64 `atomicrmw max` (seq_cst, no syncscope) whose result is
; unused, on the flat address %out + %index * 8 + 32 (two chained i64 GEPs:
; a variable index, then constant element 4).
; The checks expect the index to be scaled with s_lshl_b64 by 3 and the
; operation to be expanded into a compare-and-swap retry loop
; (%atomicrmw.start) built around flat_atomic_cmpswap_x2. The GFX7 and GFX8
; checks add the constant 32 with scalar adds; the GFX9 checks use the
; instruction's offset:32 field.
; NOTE(review): !1 is defined outside this excerpt - confirm which address
; spaces the !noalias.addrspace metadata excludes.
define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
|
|
; GFX7-LABEL: atomic_max_i64_addr64_offset:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
|
|
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
|
|
; GFX7-NEXT: s_add_u32 s0, s0, s4
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, s5
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 32
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v5, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v4, s0
|
|
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
|
|
; GFX7-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, s3
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, s2
|
|
; GFX7-NEXT: .LBB88_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, v0
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB88_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX8-LABEL: atomic_max_i64_addr64_offset:
|
|
; GFX8: ; %bb.0: ; %entry
|
|
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
|
|
; GFX8-NEXT: s_add_u32 s0, s0, s4
|
|
; GFX8-NEXT: s_addc_u32 s1, s1, s5
|
|
; GFX8-NEXT: s_add_u32 s0, s0, 32
|
|
; GFX8-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v5, s1
|
|
; GFX8-NEXT: v_mov_b32_e32 v4, s0
|
|
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
|
|
; GFX8-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, s3
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, s2
|
|
; GFX8-NEXT: .LBB88_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, v0
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB88_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_endpgm
|
|
;
|
|
; GFX9-LABEL: atomic_max_i64_addr64_offset:
|
|
; GFX9: ; %bb.0: ; %entry
|
|
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
|
|
; GFX9-NEXT: s_add_u32 s0, s0, s4
|
|
; GFX9-NEXT: s_addc_u32 s1, s1, s5
|
|
; GFX9-NEXT: v_mov_b32_e32 v5, s1
|
|
; GFX9-NEXT: v_mov_b32_e32 v4, s0
|
|
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32
|
|
; GFX9-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, s3
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, s2
|
|
; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX9-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, v0
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB88_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i64, ptr %out, i64 %index
|
|
%gep = getelementptr i64, ptr %ptr, i64 4
|
|
%tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret void
|
|
}
|
|
|
|
; Kernel test: i64 `atomicrmw max` (seq_cst, no syncscope) on the flat
; address %out + %index * 8 + 32, with the value returned by the atomic
; stored to %out2.
; The checks expect the index to be scaled with s_lshl_b64 by 3, the
; operation to be expanded into a compare-and-swap retry loop
; (%atomicrmw.start) built around flat_atomic_cmpswap_x2, and a
; flat_store_dwordx2 of the loop result after the loop. The GFX7 and GFX8
; checks add the constant 32 with scalar adds; the GFX9 checks use the
; instruction's offset:32 field.
; NOTE(review): !1 is defined outside this excerpt - confirm which address
; spaces the !noalias.addrspace metadata excludes.
define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
|
|
; GFX7-LABEL: atomic_max_i64_ret_addr64_offset:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
|
|
; GFX7-NEXT: s_add_u32 s0, s0, s6
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, s7
|
|
; GFX7-NEXT: s_add_u32 s0, s0, 32
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
|
; GFX7-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v4, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v5, s4
|
|
; GFX7-NEXT: .LBB89_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v9, v3
|
|
; GFX7-NEXT: v_mov_b32_e32 v8, v2
|
|
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
|
|
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB89_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX8-LABEL: atomic_max_i64_ret_addr64_offset:
|
|
; GFX8: ; %bb.0: ; %entry
|
|
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
|
|
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
|
|
; GFX8-NEXT: s_add_u32 s0, s0, s6
|
|
; GFX8-NEXT: s_addc_u32 s1, s1, s7
|
|
; GFX8-NEXT: s_add_u32 s0, s0, 32
|
|
; GFX8-NEXT: s_addc_u32 s1, s1, 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
|
; GFX8-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v4, s5
|
|
; GFX8-NEXT: v_mov_b32_e32 v5, s4
|
|
; GFX8-NEXT: .LBB89_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v9, v3
|
|
; GFX8-NEXT: v_mov_b32_e32 v8, v2
|
|
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
|
|
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB89_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
|
; GFX8-NEXT: s_endpgm
|
|
;
|
|
; GFX9-LABEL: atomic_max_i64_ret_addr64_offset:
|
|
; GFX9: ; %bb.0: ; %entry
|
|
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
|
|
; GFX9-NEXT: s_add_u32 s0, s8, s0
|
|
; GFX9-NEXT: s_addc_u32 s1, s9, s1
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
|
|
; GFX9-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX9-NEXT: v_mov_b32_e32 v4, s13
|
|
; GFX9-NEXT: v_mov_b32_e32 v5, s12
|
|
; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v9, v3
|
|
; GFX9-NEXT: v_mov_b32_e32 v8, v2
|
|
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
|
|
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB89_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s10
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s11
|
|
; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
|
; GFX9-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i64, ptr %out, i64 %index
|
|
%gep = getelementptr i64, ptr %ptr, i64 4
|
|
%tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
|
|
store i64 %tmp0, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) {
|
|
; GFX7-LABEL: atomic_max_i64_addr64:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
|
|
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
|
|
; GFX7-NEXT: s_add_u32 s0, s0, s4
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v5, s1
|
|
; GFX7-NEXT: v_mov_b32_e32 v4, s0
|
|
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
|
|
; GFX7-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, s3
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, s2
|
|
; GFX7-NEXT: .LBB90_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, v0
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB90_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX8-LABEL: atomic_max_i64_addr64:
|
|
; GFX8: ; %bb.0: ; %entry
|
|
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
|
|
; GFX8-NEXT: s_add_u32 s0, s0, s4
|
|
; GFX8-NEXT: s_addc_u32 s1, s1, s5
|
|
; GFX8-NEXT: v_mov_b32_e32 v5, s1
|
|
; GFX8-NEXT: v_mov_b32_e32 v4, s0
|
|
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
|
|
; GFX8-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, s3
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, s2
|
|
; GFX8-NEXT: .LBB90_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, v0
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB90_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_endpgm
|
|
;
|
|
; GFX9-LABEL: atomic_max_i64_addr64:
|
|
; GFX9: ; %bb.0: ; %entry
|
|
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
|
|
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
|
|
; GFX9-NEXT: s_add_u32 s0, s0, s4
|
|
; GFX9-NEXT: s_addc_u32 s1, s1, s5
|
|
; GFX9-NEXT: v_mov_b32_e32 v5, s1
|
|
; GFX9-NEXT: v_mov_b32_e32 v4, s0
|
|
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
|
|
; GFX9-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, s3
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, s2
|
|
; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
|
|
; GFX9-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, v0
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB90_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i64, ptr %out, i64 %index
|
|
%tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret void
|
|
}
|
|
|
|
; Kernel test: signed 64-bit `atomicrmw max` (seq_cst) through a flat pointer
; at %out + %index * 8; the value returned by the atomic is stored to %out2.
; For all three targets the checks expect a compare-and-swap loop
; (flat_atomic_cmpswap_x2 ... glc) instead of a single max instruction: each
; iteration recomputes the maximum from the last observed value
; (v_cmp_lt_i64 + two v_cndmask_b32) and the loop repeats until the value
; returned by the cmpswap equals the value it was compared against.
; The assertion lines are produced by utils/update_llc_test_checks.py;
; regenerate them rather than editing them by hand.
define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_max_i64_ret_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB91_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_max_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB91_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
; GFX9-NEXT: s_add_u32 s0, s8, s0
; GFX9-NEXT: s_addc_u32 s1, s9, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v4, s13
; GFX9-NEXT: v_mov_b32_e32 v5, s12
; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v9, v3
; GFX9-NEXT: v_mov_b32_e32 v8, v2
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[8:9]
; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB91_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, s10
; GFX9-NEXT: v_mov_b32_e32 v1, s11
; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX9-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; Signed 64-bit `atomicrmw max` (seq_cst) on the flat address %out + 4 i64
; elements (byte offset 32), result unused. The atomic carries
; !amdgpu.no.remote.memory in addition to !noalias.addrspace.
; The checks for every target still expect a flat_atomic_cmpswap_x2 loop
; with that metadata present. On the two older targets the 32-byte offset is
; added with explicit VALU adds and the initial value is read as two dword
; loads (offsets 32 and 36); on gfx900 the offset is folded into the
; instructions as offset:32.
; The assertion lines are produced by utils/update_llc_test_checks.py;
; regenerate them rather than editing them by hand.
define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: flat_load_dword v6, v[8:9]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB92_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB92_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: flat_load_dword v6, v[8:9]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB92_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB92_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB92_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Signed 64-bit `atomicrmw max` (seq_cst) on the flat address %out + 4 i64
; elements (byte offset 32); the value returned by the atomic is the
; function result. The atomic carries !amdgpu.no.remote.memory in addition
; to !noalias.addrspace.
; The checks for every target still expect a flat_atomic_cmpswap_x2 loop
; with that metadata present. The loop keeps a copy of the previously
; observed value for the equality test that decides whether to retry, and
; the result is returned in v[0:1] (gfx900 needs two extra moves after the
; loop to get it there).
; The assertion lines are produced by utils/update_llc_test_checks.py;
; regenerate them rather than editing them by hand.
define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v1, v[0:1]
; GFX7-NEXT: flat_load_dword v0, v[4:5]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB93_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB93_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v1, v[0:1]
; GFX8-NEXT: flat_load_dword v0, v[4:5]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB93_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB93_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB93_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; ---------------------------------------------------------------------
; atomicrmw umax
; ---------------------------------------------------------------------
|
|
|
|
; Unsigned 64-bit `atomicrmw umax` (seq_cst) directly on the flat pointer
; %ptr, result unused.
; The checks for every target expect a flat_atomic_cmpswap_x2 loop: the
; candidate value is selected with an unsigned compare (v_cmp_gt_u64 + two
; v_cndmask_b32), and the loop repeats until the value returned by the
; cmpswap equals the value it was compared against. The two older targets
; read the initial value as two dword loads; gfx900 uses one dwordx2 load.
; The assertion lines are produced by utils/update_llc_test_checks.py;
; regenerate them rather than editing them by hand.
define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_umax_i64_noret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: flat_load_dword v7, v[4:5]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB94_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB94_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_umax_i64_noret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: flat_load_dword v7, v[4:5]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB94_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB94_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_umax_i64_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB94_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Unsigned 64-bit `atomicrmw umax` (seq_cst) on the flat address %out + 4 i64
; elements (byte offset 32), result unused.
; The checks for every target expect a flat_atomic_cmpswap_x2 loop with an
; unsigned compare (v_cmp_gt_u64) selecting the candidate value. On the two
; older targets the 32-byte offset is added with explicit VALU adds and the
; initial value is read as two dword loads (offsets 32 and 36); on gfx900 the
; offset is folded into the instructions as offset:32.
; The assertion lines are produced by utils/update_llc_test_checks.py;
; regenerate them rather than editing them by hand.
define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_umax_i64_noret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: flat_load_dword v6, v[8:9]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB95_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB95_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_umax_i64_noret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: flat_load_dword v6, v[8:9]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB95_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB95_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_umax_i64_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB95_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB95_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Unsigned 64-bit `atomicrmw umax` (seq_cst) directly on the flat pointer
; %ptr; the value returned by the atomic is the function result.
; The checks for every target expect a flat_atomic_cmpswap_x2 loop with an
; unsigned compare (v_cmp_gt_u64) selecting the candidate value. Each
; iteration first copies the last observed value to v[6:7] so it can be
; compared with the cmpswap result; after the loop the result is moved from
; v[4:5] to v[0:1] for the return.
; The assertion lines are produced by utils/update_llc_test_checks.py;
; regenerate them rather than editing them by hand.
define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_umax_i64_ret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v4, v[0:1]
; GFX7-NEXT: flat_load_dword v5, v[5:6]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB96_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB96_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v0, v4
; GFX7-NEXT: v_mov_b32_e32 v1, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_umax_i64_ret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: flat_load_dword v5, v[5:6]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB96_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB96_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_umax_i64_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB96_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB96_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; Unsigned 64-bit `atomicrmw umax` (seq_cst) on the flat address %out + 4 i64
; elements (byte offset 32); the value returned by the atomic is the
; function result.
; The checks for every target expect a flat_atomic_cmpswap_x2 loop with an
; unsigned compare (v_cmp_gt_u64) selecting the candidate value. On the two
; older targets the offset is added with explicit VALU adds and the cmpswap
; result already lands in v[0:1]; on gfx900 the offset is folded in as
; offset:32 and two moves after the loop place the result in v[0:1].
; The assertion lines are produced by utils/update_llc_test_checks.py;
; regenerate them rather than editing them by hand.
define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_umax_i64_ret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v1, v[0:1]
; GFX7-NEXT: flat_load_dword v0, v[4:5]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB97_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB97_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_umax_i64_ret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v1, v[0:1]
; GFX8-NEXT: flat_load_dword v0, v[4:5]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB97_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB97_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_umax_i64_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB97_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB97_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; i64 atomicrmw umax (seq_cst) through a flat pointer, result unused. Both the
; pointer and the operand are passed `inreg` under the amdgpu_gfx calling
; convention; the expected code reads them from s[4:5] and s[6:7].
; On all three run configurations the operation is expected to lower to an
; initial load followed by a flat_atomic_cmpswap_x2 retry loop (the block
; labelled atomicrmw.start), not a single native umax instruction. The two
; older targets read the initial value as two dword loads, gfx900 as one
; dwordx2 load.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand.
define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_umax_i64_noret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_add_u32 s34, s4, 4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s34
; GFX7-NEXT: v_mov_b32_e32 v4, s35
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: flat_load_dword v3, v[3:4]
; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_mov_b32_e32 v7, s6
; GFX7-NEXT: v_mov_b32_e32 v5, s5
; GFX7-NEXT: .LBB98_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB98_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_umax_i64_noret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_add_u32 s34, s4, 4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s34
; GFX8-NEXT: v_mov_b32_e32 v4, s35
; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: flat_load_dword v3, v[3:4]
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: .LBB98_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB98_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_umax_i64_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v6, s7
; GFX9-NEXT: v_mov_b32_e32 v7, s6
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB98_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Same as the no-offset scalar umax test, but the atomic targets element 4 of
; the i64 array at %out (a 32-byte displacement); the result is unused.
; Pointer and operand are `inreg` and are read from s[4:5] and s[6:7] in the
; expected code.
; The two older targets are expected to materialise the +32/+36 addresses with
; scalar adds before the loads, while gfx900 folds the displacement into the
; instructions as offset:32. All three lower the operation to a
; flat_atomic_cmpswap_x2 retry loop.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand.
define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_umax_i64_noret_offset_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: s_add_u32 s36, s4, 36
; GFX7-NEXT: s_addc_u32 s37, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s36
; GFX7-NEXT: v_mov_b32_e32 v1, s37
; GFX7-NEXT: v_mov_b32_e32 v4, s34
; GFX7-NEXT: v_mov_b32_e32 v5, s35
; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: flat_load_dword v2, v[4:5]
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_mov_b32_e32 v7, s6
; GFX7-NEXT: .LBB99_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB99_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_umax_i64_noret_offset_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: s_add_u32 s36, s4, 36
; GFX8-NEXT: s_addc_u32 s37, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s36
; GFX8-NEXT: v_mov_b32_e32 v1, s37
; GFX8-NEXT: v_mov_b32_e32 v4, s34
; GFX8-NEXT: v_mov_b32_e32 v5, s35
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[4:5]
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NEXT: .LBB99_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB99_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_umax_i64_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v6, s7
; GFX9-NEXT: v_mov_b32_e32 v7, s6
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB99_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; i64 atomicrmw umax (seq_cst) through a flat pointer whose old value is
; returned to the caller. Pointer and operand are `inreg` and are read from
; s[4:5] and s[6:7] in the expected code.
; All three run configurations are expected to emit a flat_atomic_cmpswap_x2
; retry loop. Because the result is live, the loop copies the last observed
; value into v[8:9] as the compare operand, and the value returned by the
; cmpswap is left in v[0:1] at the s_setpc_b64 return.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand.
define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_umax_i64_ret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_add_u32 s34, s4, 4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: flat_load_dword v1, v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s7
; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: .LBB100_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB100_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_umax_i64_ret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_add_u32 s34, s4, 4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s7
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: .LBB100_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB100_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_umax_i64_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v4, s7
; GFX9-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v9, v1
; GFX9-NEXT: v_mov_b32_e32 v8, v0
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB100_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; Returning variant of the scalar umax test with a displacement: the atomic
; targets element 4 of the i64 array at %out (32 bytes in) and the old value
; is returned. Pointer and operand are `inreg` and are read from s[4:5] and
; s[6:7] in the expected code.
; The two older targets are expected to compute the +32/+36 addresses with
; scalar adds; gfx900 uses offset:32 on the load and on the cmpswap. All three
; lower the operation to a flat_atomic_cmpswap_x2 retry loop and leave the
; result in v[0:1] at the s_setpc_b64 return.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand.
define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_umax_i64_ret_offset_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: s_add_u32 s36, s4, 36
; GFX7-NEXT: s_addc_u32 s37, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s36
; GFX7-NEXT: v_mov_b32_e32 v1, s37
; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v1, v[0:1]
; GFX7-NEXT: flat_load_dword v0, v[2:3]
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s7
; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: .LBB101_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB101_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_umax_i64_ret_offset_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: s_add_u32 s36, s4, 36
; GFX8-NEXT: s_addc_u32 s37, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s36
; GFX8-NEXT: v_mov_b32_e32 v1, s37
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v1, v[0:1]
; GFX8-NEXT: flat_load_dword v0, v[2:3]
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s7
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: .LBB101_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB101_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_umax_i64_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v4, s7
; GFX9-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v9, v1
; GFX9-NEXT: v_mov_b32_e32 v8, v0
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9]
; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB101_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; Kernel variant: i64 atomicrmw umax (seq_cst) at %out[%index + 4], result
; unused. The expected code loads the kernel arguments with s_load_* from
; s[4:5], scales %index by 8 (s_lshl_b64 ..., 3) and adds it to the base.
; The two older targets add the remaining 32 bytes with scalar adds, while
; gfx900 carries them as offset:32 on the load and the cmpswap. All three
; lower the operation to a flat_atomic_cmpswap_x2 retry loop and end with
; s_endpgm.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand.
define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_umax_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB102_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB102_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB102_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB102_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i64_addr64_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX9-NEXT: s_add_u32 s0, s0, s4
; GFX9-NEXT: s_addc_u32 s1, s1, s5
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, s3
; GFX9-NEXT: v_mov_b32_e32 v7, s2
; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB102_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %gep = getelementptr i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Kernel variant that consumes the result: i64 atomicrmw umax (seq_cst) at
; %out[%index + 4], with the old value stored to %out2. The expected code
; loads all kernel arguments with one s_load_dwordx8 from s[4:5] and scales
; %index by 8 (s_lshl_b64 ..., 3).
; The two older targets add the remaining 32 bytes with scalar adds, while
; gfx900 uses offset:32. All three lower the operation to a
; flat_atomic_cmpswap_x2 retry loop, then write the value left in v[2:3] with
; flat_store_dwordx2 before s_endpgm.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand.
define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_umax_i64_ret_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB103_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB103_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB103_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB103_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
; GFX9-NEXT: s_add_u32 s0, s8, s0
; GFX9-NEXT: s_addc_u32 s1, s9, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v4, s13
; GFX9-NEXT: v_mov_b32_e32 v5, s12
; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v9, v3
; GFX9-NEXT: v_mov_b32_e32 v8, v2
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9]
; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB103_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, s10
; GFX9-NEXT: v_mov_b32_e32 v1, s11
; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX9-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %gep = getelementptr i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; Kernel variant without a constant displacement: i64 atomicrmw umax (seq_cst)
; at %out[%index], with the old value stored to %out2. The expected code loads
; all kernel arguments with one s_load_dwordx8 from s[4:5] and scales %index
; by 8 (s_lshl_b64 ..., 3) before adding it to the base.
; All three run configurations lower the operation to a
; flat_atomic_cmpswap_x2 retry loop with no offset operand, then write the
; value left in v[2:3] with flat_store_dwordx2 before s_endpgm.
; The assertions are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand.
define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_umax_i64_ret_addr64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB104_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB104_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_umax_i64_ret_addr64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB104_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB104_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
; GFX9-NEXT: s_add_u32 s0, s8, s0
; GFX9-NEXT: s_addc_u32 s1, s9, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v4, s13
; GFX9-NEXT: v_mov_b32_e32 v5, s12
; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v9, v3
; GFX9-NEXT: v_mov_b32_e32 v8, v2
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[8:9]
; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB104_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, s10
; GFX9-NEXT: v_mov_b32_e32 v1, s11
; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX9-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; Unsigned 64-bit atomic max on a flat pointer at %out + 4 elements, with the
; old value unused. The atomicrmw carries !amdgpu.no.remote.memory, yet the
; expected code on all three targets is still a compare-and-swap retry loop
; rather than a single atomic max instruction:
;   - load the current 64-bit value,
;   - pick max(current, %in) with v_cmp_gt_u64 and two v_cndmask,
;   - issue flat_atomic_cmpswap_x2 with glc so the prior value is returned,
;   - accumulate the lanes whose returned value matched into s[4:5], mask
;     them out of exec, and loop while any lane is still active.
; bonaire and tonga build the +32 and +36 addresses with explicit adds and
; load the two halves separately; gfx900 folds the offset into the memory
; instructions instead.
; NOTE(review): presumably the backend does not act on the
; !amdgpu.no.remote.memory metadata at this revision, which is why the loop
; is still expected here - confirm before regenerating the checks.
define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: flat_load_dword v6, v[8:9]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB105_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB105_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: flat_load_dword v6, v[8:9]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB105_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB105_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB105_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
  ret void
}
; Unsigned 64-bit atomic max on a flat pointer at %out + 4 elements, returning
; the value that was in memory before the operation. As in the noret variant,
; the atomicrmw carries !amdgpu.no.remote.memory but the expected code is a
; compare-and-swap retry loop on every target.
; Because the old value is the function result, each iteration first copies
; the last observed value into a second register pair, which serves as the
; compare operand, so the cmpswap can overwrite the original pair with its
; returned value.
; On bonaire and tonga the cmpswap result lands directly in v[0:1], the
; return registers; on gfx900 it lands in v[4:5] and is copied to v[0:1]
; after the loop.
; NOTE(review): presumably the backend does not act on the
; !amdgpu.no.remote.memory metadata at this revision - confirm before
; regenerating the checks.
define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v1, v[0:1]
; GFX7-NEXT: flat_load_dword v0, v[4:5]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB106_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB106_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v1, v[0:1]
; GFX8-NEXT: flat_load_dword v0, v[4:5]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB106_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB106_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB106_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
  ret i64 %result
}

; ---------------------------------------------------------------------
; atomicrmw umin
; ---------------------------------------------------------------------

; Unsigned 64-bit atomic min directly on the flat pointer %ptr, with the old
; value unused. The expected code on every target is a compare-and-swap retry
; loop:
;   - load the current 64-bit value,
;   - keep the current value when current <= %in (v_cmp_le_u64), otherwise
;     take %in, using two v_cndmask,
;   - issue flat_atomic_cmpswap_x2 with glc,
;   - accumulate the lanes whose returned value matched into s[4:5], mask
;     them out of exec, and loop while any lane is still active.
; bonaire and tonga load the value as two dwords, computing the +4 address of
; the high half with an explicit add; gfx900 uses one flat_load_dwordx2.
define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_umin_i64_noret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: flat_load_dword v7, v[4:5]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB107_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB107_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_umin_i64_noret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: flat_load_dword v7, v[4:5]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB107_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB107_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_umin_i64_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB107_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
; Unsigned 64-bit atomic min on a flat pointer at %out + 4 elements, with the
; old value unused. The expected code on every target is a compare-and-swap
; retry loop that keeps the current value when current <= %in (v_cmp_le_u64)
; and otherwise installs %in via flat_atomic_cmpswap_x2.
; bonaire and tonga build the +32 and +36 addresses with explicit adds and
; load the two halves separately; gfx900 folds the offset into the load and
; the cmpswap instead.
define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_umin_i64_noret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: flat_load_dword v6, v[8:9]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB108_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB108_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_umin_i64_noret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: flat_load_dword v6, v[8:9]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB108_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB108_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_umin_i64_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB108_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB108_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
; Unsigned 64-bit atomic min directly on the flat pointer %ptr, returning the
; value that was in memory before the operation. The expected code on every
; target is a compare-and-swap retry loop.
; Each iteration copies the last observed value from v[4:5] into v[6:7] as
; the compare operand, selects the new value into v[4:5] (current when
; current <= %in, else %in), and lets flat_atomic_cmpswap_x2 write its
; returned value back into v[4:5]. After the loop v[4:5] is copied to v[0:1],
; the return registers.
define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_umin_i64_ret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v4, v[0:1]
; GFX7-NEXT: flat_load_dword v5, v[5:6]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB109_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB109_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v0, v4
; GFX7-NEXT: v_mov_b32_e32 v1, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_umin_i64_ret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: flat_load_dword v5, v[5:6]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB109_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB109_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_umin_i64_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB109_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB109_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
; Unsigned 64-bit atomic min on a flat pointer at %out + 4 elements, returning
; the value that was in memory before the operation. The expected code on
; every target is a compare-and-swap retry loop that keeps the current value
; when current <= %in (v_cmp_le_u64) and otherwise installs %in.
; On bonaire and tonga the +32 and +36 addresses are built with explicit adds
; and the cmpswap result lands directly in v[0:1], the return registers. On
; gfx900 the offset is folded into the memory instructions and the result is
; copied from v[4:5] to v[0:1] after the loop.
define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_umin_i64_ret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v1, v[0:1]
; GFX7-NEXT: flat_load_dword v0, v[4:5]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB110_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB110_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_umin_i64_ret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v1, v[0:1]
; GFX8-NEXT: flat_load_dword v0, v[4:5]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB110_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB110_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_umin_i64_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB110_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB110_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
; Unsigned 64-bit atomic min with both arguments marked inreg under the
; amdgpu_gfx calling convention, old value unused. In the expected code the
; pointer is read from s[4:5] and the operand from s[6:7].
; The expected code is a compare-and-swap retry loop on every target:
;   - the pointer is copied into v[4:5] and the operand halves into v6/v7 so
;     the vector select and the flat instructions can use them,
;   - v_cmp_ge_u64 compares the scalar operand against the current value;
;     when %in >= current the current value is kept, otherwise %in is used,
;   - lanes that have succeeded are accumulated in s[34:35] and masked out
;     of exec until none remain.
; bonaire and tonga compute the +4 address of the high half with scalar adds
; and load two dwords; gfx900 uses a single flat_load_dwordx2.
define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_umin_i64_noret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_add_u32 s34, s4, 4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s34
; GFX7-NEXT: v_mov_b32_e32 v4, s35
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: flat_load_dword v3, v[3:4]
; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_mov_b32_e32 v7, s6
; GFX7-NEXT: v_mov_b32_e32 v5, s5
; GFX7-NEXT: .LBB111_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB111_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_umin_i64_noret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_add_u32 s34, s4, 4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s34
; GFX8-NEXT: v_mov_b32_e32 v4, s35
; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: flat_load_dword v3, v[3:4]
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: .LBB111_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB111_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_umin_i64_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v6, s7
; GFX9-NEXT: v_mov_b32_e32 v7, s6
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB111_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
; Unsigned 64-bit atomic min at %out + 4 elements, with both arguments marked
; inreg under the amdgpu_gfx calling convention and the old value unused. In
; the expected code the pointer is read from s[4:5] and the operand from
; s[6:7].
; The expected code is a compare-and-swap retry loop on every target, using
; v_cmp_ge_u64 against the scalar operand to choose between the current value
; and %in, and s[34:35] to accumulate the lanes that have succeeded.
; bonaire and tonga form the +32 and +36 addresses with scalar adds before
; moving them to vector registers; gfx900 copies the unmodified pointer and
; folds the offset into the load and the cmpswap.
define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_umin_i64_noret_offset_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: s_add_u32 s36, s4, 36
; GFX7-NEXT: s_addc_u32 s37, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s36
; GFX7-NEXT: v_mov_b32_e32 v1, s37
; GFX7-NEXT: v_mov_b32_e32 v4, s34
; GFX7-NEXT: v_mov_b32_e32 v5, s35
; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: flat_load_dword v2, v[4:5]
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_mov_b32_e32 v7, s6
; GFX7-NEXT: .LBB112_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB112_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_umin_i64_noret_offset_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: s_add_u32 s36, s4, 36
; GFX8-NEXT: s_addc_u32 s37, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s36
; GFX8-NEXT: v_mov_b32_e32 v1, s37
; GFX8-NEXT: v_mov_b32_e32 v4, s34
; GFX8-NEXT: v_mov_b32_e32 v5, s35
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[4:5]
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NEXT: .LBB112_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB112_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_umin_i64_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v6, s7
; GFX9-NEXT: v_mov_b32_e32 v7, s6
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB112_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Test: seq_cst `atomicrmw umin` of an i64 through a flat pointer, with no
; address offset. Both the pointer and the operand are passed `inreg` under
; the amdgpu_gfx calling convention, and the atomicrmw result is returned.
; The recorded checks for all three prefixes expect the operation to be
; expanded into a loop around flat_atomic_cmpswap_x2 that repeats until the
; value returned by the cmpswap equals the value it was compared against.
; The GFX7 and GFX8 checks read the initial value with two flat_load_dword
; instructions (base and base + 4); the GFX9 checks use one flat_load_dwordx2.
define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
|
|
; GFX7-LABEL: flat_atomic_umin_i64_ret_scalar:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: s_add_u32 s34, s4, 4
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s34
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s35
|
|
; GFX7-NEXT: flat_load_dword v0, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v1, v[2:3]
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX7-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v4, s7
|
|
; GFX7-NEXT: v_mov_b32_e32 v5, s6
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s5
|
|
; GFX7-NEXT: .LBB113_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX7-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB113_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_umin_i64_ret_scalar:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX8-NEXT: s_add_u32 s34, s4, 4
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX8-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, s34
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, s35
|
|
; GFX8-NEXT: flat_load_dword v0, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v1, v[2:3]
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX8-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v4, s7
|
|
; GFX8-NEXT: v_mov_b32_e32 v5, s6
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, s5
|
|
; GFX8-NEXT: .LBB113_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX8-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB113_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_umin_i64_ret_scalar:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX9-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX9-NEXT: v_mov_b32_e32 v4, s7
|
|
; GFX9-NEXT: v_mov_b32_e32 v5, s6
|
|
; GFX9-NEXT: v_mov_b32_e32 v3, s5
|
|
; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX9-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB113_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret i64 %result
|
|
}
|
|
|
|
; Test: seq_cst `atomicrmw umin` of an i64 through a flat pointer that is
; first advanced by a getelementptr of 4 i64 elements. Both the pointer and
; the operand are passed `inreg` under the amdgpu_gfx calling convention, and
; the atomicrmw result is returned.
; The recorded checks for all three prefixes expect a flat_atomic_cmpswap_x2
; retry loop. In the GFX7 and GFX8 checks the address is formed with scalar
; adds of 32 (low dword) and 36 (high dword) before the loop; in the GFX9
; checks the same displacement appears as `offset:32` on the load and on the
; cmpswap instruction.
define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
|
|
; GFX7-LABEL: flat_atomic_umin_i64_ret_offset_scalar:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: s_add_u32 s34, s4, 32
|
|
; GFX7-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX7-NEXT: s_add_u32 s36, s4, 36
|
|
; GFX7-NEXT: s_addc_u32 s37, s5, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s36
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s37
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s34
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s35
|
|
; GFX7-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v0, v[2:3]
|
|
; GFX7-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v4, s7
|
|
; GFX7-NEXT: v_mov_b32_e32 v5, s6
|
|
; GFX7-NEXT: .LBB114_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX7-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB114_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_umin_i64_ret_offset_scalar:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: s_add_u32 s34, s4, 32
|
|
; GFX8-NEXT: s_addc_u32 s35, s5, 0
|
|
; GFX8-NEXT: s_add_u32 s36, s4, 36
|
|
; GFX8-NEXT: s_addc_u32 s37, s5, 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s36
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s37
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, s34
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, s35
|
|
; GFX8-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v0, v[2:3]
|
|
; GFX8-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v4, s7
|
|
; GFX8-NEXT: v_mov_b32_e32 v5, s6
|
|
; GFX8-NEXT: .LBB114_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX8-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB114_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_umin_i64_ret_offset_scalar:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, s4
|
|
; GFX9-NEXT: s_mov_b64 s[34:35], 0
|
|
; GFX9-NEXT: v_mov_b32_e32 v4, s7
|
|
; GFX9-NEXT: v_mov_b32_e32 v5, s6
|
|
; GFX9-NEXT: v_mov_b32_e32 v3, s5
|
|
; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX9-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB114_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret i64 %result
|
|
}
|
|
|
|
; Test: seq_cst `atomicrmw umin` of an i64 through a flat pointer advanced by
; a getelementptr of 4 i64 elements, with the atomicrmw carrying
; !amdgpu.no.remote.memory metadata. The result (%tmp0) is not used and the
; function returns void.
; As recorded, the checks for all three prefixes still expect a
; flat_atomic_cmpswap_x2 retry loop rather than a single native atomic
; instruction. The GFX7 and GFX8 checks compute the address with vector adds
; of 32 and 36; the GFX9 checks use `offset:32` on the load and the cmpswap.
define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v7, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v6, v[8:9]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB115_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB115_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v7, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v6, v[8:9]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB115_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB115_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB115_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB115_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
|
|
ret void
|
|
}
|
|
|
|
; Test: seq_cst `atomicrmw umin` of an i64 through a flat pointer advanced by
; a getelementptr of 4 i64 elements, with the atomicrmw carrying
; !amdgpu.no.remote.memory metadata. The atomicrmw result is returned.
; As recorded, the checks for all three prefixes still expect a
; flat_atomic_cmpswap_x2 retry loop rather than a single native atomic
; instruction. In the GFX9 checks the cmpswap result lands in v[4:5] and is
; moved to v[0:1] after the loop; in the GFX7 and GFX8 checks the cmpswap
; writes v[0:1] directly.
define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v0, v[4:5]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB116_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX7-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB116_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v0, v[4:5]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB116_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX8-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB116_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB116_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB116_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
|
|
ret i64 %result
|
|
}
|
|
|
|
; ---------------------------------------------------------------------
|
|
; atomicrmw min
|
|
; ---------------------------------------------------------------------
|
|
|
|
; Test: seq_cst signed `atomicrmw min` of an i64 through a flat pointer with
; no address offset. The result (%tmp0) is not used and the function returns
; void.
; The recorded checks for all three prefixes expect a flat_atomic_cmpswap_x2
; retry loop whose new value is selected with a signed 64-bit compare
; (v_cmp_le_i64). The GFX7 and GFX8 checks read the initial value with two
; flat_load_dword instructions; the GFX9 checks use one flat_load_dwordx2.
define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_min_i64_noret:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v6, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v7, v[4:5]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB117_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB117_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_min_i64_noret:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v6, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v7, v[4:5]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB117_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB117_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_min_i64_noret:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB117_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB117_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret void
|
|
}
|
|
|
|
; Test: seq_cst signed `atomicrmw min` of an i64 through a flat pointer
; advanced by a getelementptr of 4 i64 elements. The result (%tmp0) is not
; used and the function returns void.
; The recorded checks for all three prefixes expect a flat_atomic_cmpswap_x2
; retry loop with a signed compare (v_cmp_le_i64). The GFX7 and GFX8 checks
; compute the address with vector adds of 32 and 36; the GFX9 checks use
; `offset:32` on the load and on the cmpswap instruction.
define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_min_i64_noret_offset:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v7, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v6, v[8:9]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB118_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB118_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_min_i64_noret_offset:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v7, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v6, v[8:9]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB118_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB118_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_min_i64_noret_offset:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB118_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB118_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret void
|
|
}
|
|
|
|
; Test: seq_cst signed `atomicrmw min` of an i64 through a flat pointer with
; no address offset, returning the atomicrmw result.
; The recorded checks for all three prefixes expect a flat_atomic_cmpswap_x2
; retry loop with a signed compare (v_cmp_le_i64). In every prefix the
; cmpswap result lands in v[4:5] and is moved to v[0:1] after the loop,
; immediately before the return.
define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_min_i64_ret:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v4, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v5, v[5:6]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB119_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB119_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_min_i64_ret:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v4, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v5, v[5:6]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB119_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB119_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_min_i64_ret:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB119_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB119_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret i64 %result
|
|
}
|
|
|
|
; Test: seq_cst signed `atomicrmw min` of an i64 through a flat pointer
; advanced by a getelementptr of 4 i64 elements, returning the atomicrmw
; result.
; The recorded checks for all three prefixes expect a flat_atomic_cmpswap_x2
; retry loop with a signed compare (v_cmp_le_i64). The GFX7 and GFX8 checks
; compute the address with vector adds of 32 and 36 and have the cmpswap write
; v[0:1] directly; the GFX9 checks use `offset:32` and move the result from
; v[4:5] to v[0:1] after the loop.
define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_min_i64_ret_offset:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v0, v[4:5]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB120_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX7-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB120_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_min_i64_ret_offset:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v0, v[4:5]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB120_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX8-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB120_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_min_i64_ret_offset:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB120_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB120_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret i64 %result
|
|
}
|
|
|
|
; Signed 64-bit `atomicrmw min` (seq_cst, no syncscope) through a flat pointer.
; Pointer and operand are `inreg` arguments and the result is unused.
; All three targets are expected to expand the operation into a
; flat_atomic_cmpswap_x2 retry loop. GFX7 and GFX8 seed the loop with two
; dword loads (ptr and ptr+4), while GFX9 uses a single flat_load_dwordx2.
define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_min_i64_noret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_add_u32 s34, s4, 4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s34
; GFX7-NEXT: v_mov_b32_e32 v4, s35
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: flat_load_dword v3, v[3:4]
; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_mov_b32_e32 v7, s6
; GFX7-NEXT: v_mov_b32_e32 v5, s5
; GFX7-NEXT: .LBB121_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB121_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_min_i64_noret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_add_u32 s34, s4, 4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s34
; GFX8-NEXT: v_mov_b32_e32 v4, s35
; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: flat_load_dword v3, v[3:4]
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: .LBB121_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB121_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_min_i64_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v6, s7
; GFX9-NEXT: v_mov_b32_e32 v7, s6
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB121_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Signed 64-bit `atomicrmw min` (seq_cst, no syncscope) at element 4 of the
; i64 array at %out, i.e. byte offset 32. Arguments are `inreg` and the result
; is unused. All three targets are expected to emit a flat_atomic_cmpswap_x2
; retry loop. GFX7 and GFX8 compute the +32 and +36 addresses with scalar adds
; and load two dwords, while GFX9 uses offset 32 on the flat instructions.
define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_min_i64_noret_offset_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: s_add_u32 s36, s4, 36
; GFX7-NEXT: s_addc_u32 s37, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s36
; GFX7-NEXT: v_mov_b32_e32 v1, s37
; GFX7-NEXT: v_mov_b32_e32 v4, s34
; GFX7-NEXT: v_mov_b32_e32 v5, s35
; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: flat_load_dword v2, v[4:5]
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_mov_b32_e32 v7, s6
; GFX7-NEXT: .LBB122_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB122_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_min_i64_noret_offset_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: s_add_u32 s36, s4, 36
; GFX8-NEXT: s_addc_u32 s37, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s36
; GFX8-NEXT: v_mov_b32_e32 v1, s37
; GFX8-NEXT: v_mov_b32_e32 v4, s34
; GFX8-NEXT: v_mov_b32_e32 v5, s35
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[4:5]
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NEXT: .LBB122_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB122_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_min_i64_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v6, s7
; GFX9-NEXT: v_mov_b32_e32 v7, s6
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB122_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Signed 64-bit `atomicrmw min` (seq_cst, no syncscope) through a flat pointer.
; Arguments are `inreg` and the value produced by the atomic is returned.
; All three targets are expected to emit a flat_atomic_cmpswap_x2 retry loop
; whose destination registers v[0:1] are still live at the final s_setpc_b64.
; The previous value is copied to v[8:9] each iteration for the exit compare.
define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_min_i64_ret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_add_u32 s34, s4, 4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: flat_load_dword v1, v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s7
; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: .LBB123_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB123_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_min_i64_ret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_add_u32 s34, s4, 4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s7
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: .LBB123_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB123_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_min_i64_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v4, s7
; GFX9-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v9, v1
; GFX9-NEXT: v_mov_b32_e32 v8, v0
; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB123_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; Signed 64-bit `atomicrmw min` (seq_cst, no syncscope) at element 4 of the
; i64 array at %out, i.e. byte offset 32. Arguments are `inreg` and the value
; produced by the atomic is returned. All three targets are expected to emit
; a flat_atomic_cmpswap_x2 retry loop. GFX7 and GFX8 form the +32 and +36
; addresses with scalar adds, while GFX9 uses offset 32 on the flat
; instructions.
define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_min_i64_ret_offset_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: s_add_u32 s36, s4, 36
; GFX7-NEXT: s_addc_u32 s37, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s36
; GFX7-NEXT: v_mov_b32_e32 v1, s37
; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v1, v[0:1]
; GFX7-NEXT: flat_load_dword v0, v[2:3]
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s7
; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: .LBB124_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB124_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_min_i64_ret_offset_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: s_add_u32 s36, s4, 36
; GFX8-NEXT: s_addc_u32 s37, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s36
; GFX8-NEXT: v_mov_b32_e32 v1, s37
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v1, v[0:1]
; GFX8-NEXT: flat_load_dword v0, v[2:3]
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s7
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: .LBB124_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB124_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_min_i64_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v4, s7
; GFX9-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v9, v1
; GFX9-NEXT: v_mov_b32_e32 v8, v0
; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9]
; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB124_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; Kernel variant. Signed 64-bit `atomicrmw min` (seq_cst, no syncscope) at
; %out[%index + 4]; the result is unused. The index is scaled with a shift by
; 3 and added to the base pointer. GFX7 and GFX8 then add the constant 32
; with scalar adds, while GFX9 uses offset 32 on the flat instructions. All
; three targets are expected to emit a flat_atomic_cmpswap_x2 retry loop.
define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_min_i64_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s4
; GFX7-NEXT: s_addc_u32 s1, s1, s5
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: .LBB125_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB125_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s4
; GFX8-NEXT: s_addc_u32 s1, s1, s5
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: .LBB125_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB125_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64_addr64_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
; GFX9-NEXT: s_add_u32 s0, s0, s4
; GFX9-NEXT: s_addc_u32 s1, s1, s5
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, s3
; GFX9-NEXT: v_mov_b32_e32 v7, s2
; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB125_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %gep = getelementptr i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Kernel variant. Signed 64-bit `atomicrmw min` (seq_cst, no syncscope) at
; %out[%index + 4], with the value produced by the atomic stored to %out2.
; All three targets are expected to emit a flat_atomic_cmpswap_x2 retry loop
; followed by a flat_store_dwordx2 of the loop result. GFX7 and GFX8 add the
; constant 32 with scalar adds, while GFX9 uses offset 32 on the flat
; instructions.
define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
; GFX7-LABEL: atomic_min_i64_ret_addr64_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX7-NEXT: s_add_u32 s0, s0, s6
; GFX7-NEXT: s_addc_u32 s1, s1, s7
; GFX7-NEXT: s_add_u32 s0, s0, 32
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: s_mov_b64 s[0:1], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s5
; GFX7-NEXT: v_mov_b32_e32 v5, s4
; GFX7-NEXT: .LBB126_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v2
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX7-NEXT: s_cbranch_execnz .LBB126_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s3
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64_ret_addr64_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
; GFX8-NEXT: s_add_u32 s0, s0, s6
; GFX8-NEXT: s_addc_u32 s1, s1, s7
; GFX8-NEXT: s_add_u32 s0, s0, 32
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_mov_b64 s[0:1], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: v_mov_b32_e32 v5, s4
; GFX8-NEXT: .LBB126_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v3
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_cbranch_execnz .LBB126_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
; GFX9-NEXT: s_add_u32 s0, s8, s0
; GFX9-NEXT: s_addc_u32 s1, s9, s1
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v4, s13
; GFX9-NEXT: v_mov_b32_e32 v5, s12
; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v9, v3
; GFX9-NEXT: v_mov_b32_e32 v8, v2
; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9]
; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB126_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, s10
; GFX9-NEXT: v_mov_b32_e32 v1, s11
; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX9-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr %out, i64 %index
  %gep = getelementptr i64, ptr %ptr, i64 4
  %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  store i64 %tmp0, ptr %out2
  ret void
}
|
|
|
|
; Kernel variant. Signed 64-bit `atomicrmw min` (seq_cst, no syncscope)
; directly on %out, with no index or offset; the result is unused. All three
; targets are expected to load the current value with flat_load_dwordx2 and
; then run a flat_atomic_cmpswap_x2 retry loop.
define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GFX7-LABEL: atomic_min_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v5, s1
; GFX7-NEXT: v_mov_b32_e32 v6, s3
; GFX7-NEXT: v_mov_b32_e32 v7, s2
; GFX7-NEXT: v_mov_b32_e32 v4, s0
; GFX7-NEXT: .LBB127_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB127_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_min_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v6, s3
; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: .LBB127_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB127_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_mov_b32_e32 v6, s3
; GFX9-NEXT: v_mov_b32_e32 v7, s2
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB127_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
entry:
  %tmp0 = atomicrmw min ptr %out, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; Kernel test: seq_cst signed 64-bit `atomicrmw min` on the flat address
; %out + %index * 8, with the old value stored to %out2. The checks below
; expect a flat_atomic_cmpswap_x2 retry loop on GFX7, GFX8 and GFX9.
define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) {
|
|
; GFX7-LABEL: atomic_min_i64_ret_addr64:
|
|
; GFX7: ; %bb.0: ; %entry
|
|
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
|
|
; GFX7-NEXT: s_add_u32 s0, s0, s6
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, s7
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
|
; GFX7-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v4, s5
|
|
; GFX7-NEXT: v_mov_b32_e32 v5, s4
|
|
; GFX7-NEXT: .LBB128_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v9, v3
|
|
; GFX7-NEXT: v_mov_b32_e32 v8, v2
|
|
; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
|
|
; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB128_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX8-LABEL: atomic_min_i64_ret_addr64:
|
|
; GFX8: ; %bb.0: ; %entry
|
|
; GFX8-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x24
|
|
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
|
|
; GFX8-NEXT: s_add_u32 s0, s0, s6
|
|
; GFX8-NEXT: s_addc_u32 s1, s1, s7
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
|
; GFX8-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v4, s5
|
|
; GFX8-NEXT: v_mov_b32_e32 v5, s4
|
|
; GFX8-NEXT: .LBB128_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v9, v3
|
|
; GFX8-NEXT: v_mov_b32_e32 v8, v2
|
|
; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
|
|
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB128_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
|
; GFX8-NEXT: s_endpgm
|
|
;
|
|
; GFX9-LABEL: atomic_min_i64_ret_addr64:
|
|
; GFX9: ; %bb.0: ; %entry
|
|
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: s_lshl_b64 s[0:1], s[14:15], 3
|
|
; GFX9-NEXT: s_add_u32 s0, s8, s0
|
|
; GFX9-NEXT: s_addc_u32 s1, s9, s1
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
|
|
; GFX9-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX9-NEXT: v_mov_b32_e32 v4, s13
|
|
; GFX9-NEXT: v_mov_b32_e32 v5, s12
|
|
; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v9, v3
|
|
; GFX9-NEXT: v_mov_b32_e32 v8, v2
|
|
; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[12:13], v[8:9]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
|
|
; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB128_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s10
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s11
|
|
; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
|
; GFX9-NEXT: s_endpgm
|
|
entry:
|
|
%ptr = getelementptr i64, ptr %out, i64 %index
|
|
%tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
|
|
store i64 %tmp0, ptr %out2
|
|
ret void
|
|
}
|
|
|
|
; Seq_cst signed 64-bit `atomicrmw min` at %out + 32 bytes (GEP of 4 x i64),
; result unused, tagged with !amdgpu.no.remote.memory. The checks below expect
; a flat_atomic_cmpswap_x2 retry loop on all three targets; GFX7/GFX8 form the
; address with explicit adds, while GFX9 uses the offset:32 immediate.
define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v7, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v6, v[8:9]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB129_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB129_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v7, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v6, v[8:9]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB129_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB129_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB129_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB129_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
|
|
ret void
|
|
}
|
|
|
|
; Returning variant: seq_cst signed 64-bit `atomicrmw min` at %out + 32 bytes
; (GEP of 4 x i64), tagged with !amdgpu.no.remote.memory; the old value is the
; function result. The checks below expect a flat_atomic_cmpswap_x2 retry loop
; on all three targets.
define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v0, v[4:5]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB130_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX7-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB130_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v0, v[4:5]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB130_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX8-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB130_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB130_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB130_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
|
|
ret i64 %result
|
|
}
|
|
|
|
; ---------------------------------------------------------------------
|
|
; atomicrmw uinc_wrap
|
|
; ---------------------------------------------------------------------
|
|
|
|
; Seq_cst 64-bit `atomicrmw uinc_wrap` directly on %ptr, result unused. The
; checks below expect a flat_atomic_cmpswap_x2 retry loop whose new value is
; old + 1 when old < %in (unsigned compare) and 0 otherwise.
define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v6, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v7, v[4:5]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB131_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 1, v6
|
|
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
|
|
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB131_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v6, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v7, v[4:5]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB131_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v6
|
|
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
|
|
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB131_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB131_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6
|
|
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
|
|
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB131_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret void
|
|
}
|
|
|
|
; Seq_cst 64-bit `atomicrmw uinc_wrap` at %out + 32 bytes (GEP of 4 x i64),
; result unused. The checks below expect a flat_atomic_cmpswap_x2 retry loop;
; GFX7/GFX8 form the address with explicit adds, while GFX9 uses the offset:32
; immediate on both the initial load and the cmpswap.
define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret_offset:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v7, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v6, v[8:9]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB132_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v6
|
|
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
|
|
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB132_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_offset:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v7, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v6, v[8:9]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB132_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v6
|
|
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
|
|
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB132_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB132_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6
|
|
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
|
|
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB132_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret void
|
|
}
|
|
|
|
; Returning variant: seq_cst 64-bit `atomicrmw uinc_wrap` directly on %ptr,
; with the old value as the function result. The checks below expect a
; flat_atomic_cmpswap_x2 retry loop followed by moves of the result into
; v[0:1] on all three targets.
define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v4, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v5, v[5:6]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB133_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 1, v6
|
|
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
|
|
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB133_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v4, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v5, v[5:6]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB133_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 1, v6
|
|
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
|
|
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB133_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB133_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6
|
|
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
|
|
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB133_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret i64 %result
|
|
}
|
|
|
|
; Returning variant with offset: seq_cst 64-bit `atomicrmw uinc_wrap` at
; %out + 32 bytes (GEP of 4 x i64), with the old value as the function result.
; The checks below expect a flat_atomic_cmpswap_x2 retry loop; on GFX7/GFX8
; the loop result already lands in v[0:1], while GFX9 moves it there after
; the loop.
define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret_offset:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v0, v[4:5]
|
|
; GFX7-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX7-NEXT: .LBB134_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX7-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v8
|
|
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc
|
|
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB134_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_offset:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v0, v[4:5]
|
|
; GFX8-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX8-NEXT: .LBB134_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX8-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v8
|
|
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc
|
|
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB134_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
|
|
; GFX9-NEXT: s_mov_b64 s[4:5], 0
|
|
; GFX9-NEXT: .LBB134_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6
|
|
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
|
|
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB134_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
|
|
ret i64 %result
|
|
}
|
|
|
|
; uinc_wrap, i64, seq_cst, result unused. The pointer and the operand are
; inreg arguments of an amdgpu_gfx function; the expected output reads them
; from s[4:5] and s[6:7]. On all three run lines the operation is expected to
; be expanded into a flat_atomic_cmpswap_x2 retry loop. bonaire and tonga
; fetch the initial value with two dword loads (ptr and ptr+4), gfx900 with a
; single dwordx2 load.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing by hand.
define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_add_u32 s34, s4, 4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s34
; GFX7-NEXT: v_mov_b32_e32 v4, s35
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: flat_load_dword v3, v[3:4]
; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v5, s5
; GFX7-NEXT: .LBB135_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB135_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_add_u32 s34, s4, 4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s34
; GFX8-NEXT: v_mov_b32_e32 v4, s35
; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: flat_load_dword v3, v[3:4]
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: .LBB135_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB135_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB135_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB135_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; uinc_wrap, i64, seq_cst, result unused, applied to %out + 4 i64 elements
; (a 32-byte offset). The pointer and the operand are inreg arguments of an
; amdgpu_gfx function. The expected output is a flat_atomic_cmpswap_x2 retry
; loop on all three run lines: bonaire and tonga materialize the +32 / +36
; addresses with scalar adds, while gfx900 folds the displacement into the
; memory instructions as offset:32.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing by hand.
define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: s_add_u32 s36, s4, 36
; GFX7-NEXT: s_addc_u32 s37, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s36
; GFX7-NEXT: v_mov_b32_e32 v1, s37
; GFX7-NEXT: v_mov_b32_e32 v4, s34
; GFX7-NEXT: v_mov_b32_e32 v5, s35
; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: flat_load_dword v2, v[4:5]
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: .LBB136_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v2
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB136_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: s_add_u32 s36, s4, 36
; GFX8-NEXT: s_addc_u32 s37, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s36
; GFX8-NEXT: v_mov_b32_e32 v1, s37
; GFX8-NEXT: v_mov_b32_e32 v4, s34
; GFX8-NEXT: v_mov_b32_e32 v5, s35
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[4:5]
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: .LBB136_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB136_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB136_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB136_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; uinc_wrap, i64, seq_cst, returning the atomicrmw result. The pointer and
; the operand are inreg arguments of an amdgpu_gfx function (read from s[4:5]
; and s[6:7] in the expected output). All three run lines expect a
; flat_atomic_cmpswap_x2 retry loop whose destination is v[0:1], so no extra
; copy is needed before the return.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing by hand.
define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_add_u32 s34, s4, 4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: flat_load_dword v1, v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: .LBB137_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v6
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
; GFX7-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB137_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_add_u32 s34, s4, 4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: .LBB137_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v6
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB137_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB137_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, v1
; GFX9-NEXT: v_mov_b32_e32 v6, v0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v6
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB137_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; uinc_wrap, i64, seq_cst, returning the atomicrmw result, applied to
; %out + 4 i64 elements (a 32-byte offset). The pointer and the operand are
; inreg arguments of an amdgpu_gfx function. The expected output is a
; flat_atomic_cmpswap_x2 retry loop on all three run lines: bonaire and tonga
; compute the +32 / +36 addresses with scalar adds, gfx900 uses offset:32 on
; the load and on the cmpswap.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing by hand.
define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: s_add_u32 s36, s4, 36
; GFX7-NEXT: s_addc_u32 s37, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s36
; GFX7-NEXT: v_mov_b32_e32 v1, s37
; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v1, v[0:1]
; GFX7-NEXT: flat_load_dword v0, v[2:3]
; GFX7-NEXT: s_mov_b64 s[34:35], 0
; GFX7-NEXT: .LBB138_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v6
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
; GFX7-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_cbranch_execnz .LBB138_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: s_add_u32 s36, s4, 36
; GFX8-NEXT: s_addc_u32 s37, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s36
; GFX8-NEXT: v_mov_b32_e32 v1, s37
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v1, v[0:1]
; GFX8-NEXT: flat_load_dword v0, v[2:3]
; GFX8-NEXT: s_mov_b64 s[34:35], 0
; GFX8-NEXT: .LBB138_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v6
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_cbranch_execnz .LBB138_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[34:35], 0
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB138_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, v1
; GFX9-NEXT: v_mov_b32_e32 v6, v0
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 1, v6
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v7, vcc
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[6:7]
; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_cbranch_execnz .LBB138_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; uinc_wrap, i64, seq_cst, result unused, applied to %out + 4 i64 elements
; (a 32-byte offset), with !amdgpu.no.remote.memory attached to the
; atomicrmw. Arguments use the default calling convention; the expected
; output reads the pointer from v[0:1] and the operand from v[2:3].
; As recorded here, the expected output on all three run lines is still a
; flat_atomic_cmpswap_x2 retry loop, i.e. the metadata does not change the
; lowering at this revision.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing by hand.
define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: flat_load_dword v6, v[8:9]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB139_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v6
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB139_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: flat_load_dword v6, v[8:9]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB139_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v6
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v7, vcc
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v5, 0, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB139_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB139_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB139_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
  ret void
}
|
|
|
|
; uinc_wrap, i64, seq_cst, returning the atomicrmw result, applied to
; %out + 4 i64 elements (a 32-byte offset), with !amdgpu.no.remote.memory
; attached to the atomicrmw. Arguments use the default calling convention
; (pointer in v[0:1], operand in v[2:3] in the expected output).
; As recorded here, the expected output on all three run lines is still a
; flat_atomic_cmpswap_x2 retry loop. On gfx900 the loop result lives in
; v[4:5] and is copied to v[0:1] before the return.
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing by hand.
define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v1, v[0:1]
; GFX7-NEXT: flat_load_dword v0, v[4:5]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB140_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 1, v8
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc
; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3]
; GFX7-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB140_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v1, v[0:1]
; GFX8-NEXT: flat_load_dword v0, v[4:5]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB140_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 1, v8
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v9, vcc
; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[2:3]
; GFX8-NEXT: v_cndmask_b32_e32 v7, 0, v1, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v0, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB140_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
; GFX9-NEXT: s_mov_b64 s[4:5], 0
; GFX9-NEXT: .LBB140_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 1, v6
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_cbranch_execnz .LBB140_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; ---------------------------------------------------------------------
; atomicrmw udec_wrap
; ---------------------------------------------------------------------
|
|
|
|
; udec_wrap, i64, seq_cst, result unused, directly on %ptr. Arguments use
; the default calling convention (pointer in v[0:1], operand in v[2:3] in the
; expected output). All three run lines expect a flat_atomic_cmpswap_x2 retry
; loop. Each iteration computes loaded-1 and replaces it with the operand
; when the loaded value is zero or greater than the operand (v_cmp_eq 0,
; v_cmp_gt, s_or, then the two v_cndmask selects).
; The check lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing by hand.
define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v6, v[0:1]
; GFX7-NEXT: flat_load_dword v7, v[4:5]
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB141_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
; GFX7-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
; GFX7-NEXT: v_add_i32_e64 v4, s[6:7], -1, v6
; GFX7-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7]
; GFX7-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB141_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v6, v[0:1]
; GFX8-NEXT: flat_load_dword v7, v[4:5]
; GFX8-NEXT: s_mov_b64 s[8:9], 0
; GFX8-NEXT: .LBB141_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
; GFX8-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
; GFX8-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6
; GFX8-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7]
; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_cbranch_execnz .LBB141_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
; GFX9-NEXT: s_mov_b64 s[8:9], 0
; GFX9-NEXT: .LBB141_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6
; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7]
; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_cbranch_execnz .LBB141_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; i64 udec_wrap, seq_cst, on a flat pointer at %out + 4 x i64 (32 bytes);
; the result is unused.
; All three targets are expected to emit a compare-and-swap retry loop
; (flat_atomic_cmpswap_x2 inside %atomicrmw.start). The value fed to the
; cmpswap is %in when the loaded value is 0 or unsigned-greater than %in,
; otherwise the loaded value minus 1.
; The GFX7 and GFX8 checks materialise the +32 and +36 addresses with adds
; and load the two dwords separately; the GFX9 checks use a single
; flat_load_dwordx2 and carry the 32 in the instruction's offset field.
define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v7, v[0:1]
; GFX7-NEXT: flat_load_dword v6, v[8:9]
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB142_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
; GFX7-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], -1, v6
; GFX7-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v7, s[6:7]
; GFX7-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v5, v1, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX7-NEXT: v_mov_b32_e32 v7, v1
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB142_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v7, v[0:1]
; GFX8-NEXT: flat_load_dword v6, v[8:9]
; GFX8-NEXT: s_mov_b64 s[8:9], 0
; GFX8-NEXT: .LBB142_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
; GFX8-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
; GFX8-NEXT: v_add_u32_e64 v0, s[6:7], -1, v6
; GFX8-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v7, s[6:7]
; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v5, v1, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_cbranch_execnz .LBB142_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
; GFX9-NEXT: s_mov_b64 s[8:9], 0
; GFX9-NEXT: .LBB142_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6
; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7]
; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_cbranch_execnz .LBB142_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; i64 udec_wrap, seq_cst, directly on the flat pointer %ptr; the previous
; memory value is returned.
; All three targets are expected to emit a compare-and-swap retry loop
; (flat_atomic_cmpswap_x2 inside %atomicrmw.start). Each iteration copies
; the last observed value to v[6:7] as the compare operand, and after the
; loop the checks move the cmpswap result from v[4:5] into v[0:1].
; The GFX7 and GFX8 checks load the initial value as two dwords (+0 and +4);
; the GFX9 checks use a single flat_load_dwordx2.
define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) {
; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v4, v[0:1]
; GFX7-NEXT: flat_load_dword v5, v[5:6]
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB143_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
; GFX7-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
; GFX7-NEXT: v_add_i32_e64 v4, s[6:7], -1, v6
; GFX7-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7]
; GFX7-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB143_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_mov_b32_e32 v0, v4
; GFX7-NEXT: v_mov_b32_e32 v1, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: flat_load_dword v5, v[5:6]
; GFX8-NEXT: s_mov_b64 s[8:9], 0
; GFX8-NEXT: .LBB143_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
; GFX8-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
; GFX8-NEXT: v_add_u32_e64 v4, s[6:7], -1, v6
; GFX8-NEXT: v_addc_u32_e64 v5, s[6:7], -1, v7, s[6:7]
; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_cbranch_execnz .LBB143_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: v_mov_b32_e32 v0, v4
; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX9-NEXT: s_mov_b64 s[8:9], 0
; GFX9-NEXT: .LBB143_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6
; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7]
; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_cbranch_execnz .LBB143_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; i64 udec_wrap, seq_cst, on a flat pointer at %out + 4 x i64 (32 bytes);
; the previous memory value is returned.
; All three targets are expected to emit a compare-and-swap retry loop
; (flat_atomic_cmpswap_x2 inside %atomicrmw.start).
; In the GFX7 and GFX8 checks the +32 address is kept in v[4:5] and the
; cmpswap writes its result straight into v[0:1], so no copy follows the
; loop. In the GFX9 checks the base stays in v[0:1] with offset:32 on the
; memory instructions, and the result is copied from v[4:5] to v[0:1] at
; the end.
define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) {
; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v1, v[0:1]
; GFX7-NEXT: flat_load_dword v0, v[4:5]
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB144_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX7-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3]
; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], -1, v8
; GFX7-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7]
; GFX7-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB144_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v1, v[0:1]
; GFX8-NEXT: flat_load_dword v0, v[4:5]
; GFX8-NEXT: s_mov_b64 s[8:9], 0
; GFX8-NEXT: .LBB144_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX8-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3]
; GFX8-NEXT: v_add_u32_e64 v0, s[6:7], -1, v8
; GFX8-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7]
; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_cbranch_execnz .LBB144_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
; GFX9-NEXT: s_mov_b64 s[8:9], 0
; GFX9-NEXT: .LBB144_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: v_mov_b32_e32 v6, v4
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6
; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7]
; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_cbranch_execnz .LBB144_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: v_mov_b32_e32 v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; i64 udec_wrap, seq_cst, on the flat pointer %ptr with both arguments
; marked inreg under the amdgpu_gfx calling convention; the result is
; unused.
; The checks read the pointer from s4/s5 and the operand from s6/s7 and
; copy them into VGPRs ahead of the loop. All three targets are expected to
; emit a compare-and-swap retry loop (flat_atomic_cmpswap_x2 inside
; %atomicrmw.start); the wrap test is a v_cmp_lt_u64 with the SGPR operand
; on the left, OR-ed with the compare against 0.
; The GFX7 and GFX8 checks load the initial value as two dwords (+0 and +4);
; the GFX9 checks use a single flat_load_dwordx2.
define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_add_u32 s34, s4, 4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v3, s34
; GFX7-NEXT: v_mov_b32_e32 v4, s35
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: flat_load_dword v3, v[3:4]
; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: s_mov_b64 s[38:39], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_mov_b32_e32 v7, s6
; GFX7-NEXT: v_mov_b32_e32 v5, s5
; GFX7-NEXT: .LBB145_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX7-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3]
; GFX7-NEXT: v_add_i32_e64 v0, s[36:37], -1, v2
; GFX7-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37]
; GFX7-NEXT: s_or_b64 vcc, vcc, s[34:35]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[38:39]
; GFX7-NEXT: s_cbranch_execnz .LBB145_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_add_u32 s34, s4, 4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v3, s34
; GFX8-NEXT: v_mov_b32_e32 v4, s35
; GFX8-NEXT: flat_load_dword v2, v[0:1]
; GFX8-NEXT: flat_load_dword v3, v[3:4]
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_mov_b64 s[38:39], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: .LBB145_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX8-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3]
; GFX8-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2
; GFX8-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37]
; GFX8-NEXT: s_or_b64 vcc, vcc, s[34:35]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[38:39]
; GFX8-NEXT: s_cbranch_execnz .LBB145_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[38:39], 0
; GFX9-NEXT: v_mov_b32_e32 v6, s7
; GFX9-NEXT: v_mov_b32_e32 v7, s6
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB145_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3]
; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2
; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37]
; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
; GFX9-NEXT: s_cbranch_execnz .LBB145_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; i64 udec_wrap, seq_cst, on a flat pointer at %out + 4 x i64 (32 bytes),
; with both arguments marked inreg under the amdgpu_gfx calling convention;
; the result is unused.
; The checks read the pointer from s4/s5 and the operand from s6/s7. All
; three targets are expected to emit a compare-and-swap retry loop
; (flat_atomic_cmpswap_x2 inside %atomicrmw.start).
; The GFX7 and GFX8 checks form the +32 and +36 addresses with scalar adds
; before copying them to VGPRs; the GFX9 checks copy the unmodified base
; and put offset:32 on the load and the cmpswap.
define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: s_add_u32 s36, s4, 36
; GFX7-NEXT: s_addc_u32 s37, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s36
; GFX7-NEXT: v_mov_b32_e32 v1, s37
; GFX7-NEXT: v_mov_b32_e32 v4, s34
; GFX7-NEXT: v_mov_b32_e32 v5, s35
; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: flat_load_dword v2, v[4:5]
; GFX7-NEXT: s_mov_b64 s[38:39], 0
; GFX7-NEXT: v_mov_b32_e32 v6, s7
; GFX7-NEXT: v_mov_b32_e32 v7, s6
; GFX7-NEXT: .LBB146_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX7-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3]
; GFX7-NEXT: v_add_i32_e64 v0, s[36:37], -1, v2
; GFX7-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37]
; GFX7-NEXT: s_or_b64 vcc, vcc, s[34:35]
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v3, v1
; GFX7-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[38:39]
; GFX7-NEXT: s_cbranch_execnz .LBB146_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: s_add_u32 s36, s4, 36
; GFX8-NEXT: s_addc_u32 s37, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s36
; GFX8-NEXT: v_mov_b32_e32 v1, s37
; GFX8-NEXT: v_mov_b32_e32 v4, s34
; GFX8-NEXT: v_mov_b32_e32 v5, s35
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[4:5]
; GFX8-NEXT: s_mov_b64 s[38:39], 0
; GFX8-NEXT: v_mov_b32_e32 v6, s7
; GFX8-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NEXT: .LBB146_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX8-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3]
; GFX8-NEXT: v_add_u32_e64 v0, s[36:37], -1, v2
; GFX8-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v3, s[36:37]
; GFX8-NEXT: s_or_b64 vcc, vcc, s[34:35]
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[38:39]
; GFX8-NEXT: s_cbranch_execnz .LBB146_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: s_mov_b64 s[38:39], 0
; GFX9-NEXT: v_mov_b32_e32 v6, s7
; GFX9-NEXT: v_mov_b32_e32 v7, s6
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: .LBB146_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[2:3]
; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v2
; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v3, s[36:37]
; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
; GFX9-NEXT: s_cbranch_execnz .LBB146_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret void
}
|
|
|
|
; i64 udec_wrap, seq_cst, on the flat pointer %ptr with both arguments
; marked inreg under the amdgpu_gfx calling convention; the previous memory
; value is returned.
; The checks read the pointer from s4/s5 and the operand from s6/s7. All
; three targets are expected to emit a compare-and-swap retry loop
; (flat_atomic_cmpswap_x2 inside %atomicrmw.start). The cmpswap writes its
; result to v[0:1], which the loop copies to v[8:9] as the next compare
; operand, so no register copy is needed after the loop.
; The GFX7 and GFX8 checks load the initial value as two dwords (+0 and +4);
; the GFX9 checks use a single flat_load_dwordx2.
define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: s_add_u32 s34, s4, 4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v0, v[0:1]
; GFX7-NEXT: flat_load_dword v1, v[2:3]
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_mov_b64 s[38:39], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s7
; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: v_mov_b32_e32 v3, s5
; GFX7-NEXT: .LBB147_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX7-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9]
; GFX7-NEXT: v_add_i32_e64 v0, s[36:37], -1, v8
; GFX7-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37]
; GFX7-NEXT: s_or_b64 vcc, vcc, s[34:35]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[38:39]
; GFX7-NEXT: s_cbranch_execnz .LBB147_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_add_u32 s34, s4, 4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: flat_load_dword v1, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: s_mov_b64 s[38:39], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s7
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: .LBB147_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX8-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9]
; GFX8-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8
; GFX8-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37]
; GFX8-NEXT: s_or_b64 vcc, vcc, s[34:35]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[38:39]
; GFX8-NEXT: s_cbranch_execnz .LBB147_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[38:39], 0
; GFX9-NEXT: v_mov_b32_e32 v4, s7
; GFX9-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB147_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v9, v1
; GFX9-NEXT: v_mov_b32_e32 v8, v0
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9]
; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v8
; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v9, s[36:37]
; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35]
; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
; GFX9-NEXT: s_cbranch_execnz .LBB147_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; i64 udec_wrap, seq_cst, on a flat pointer at %out + 4 x i64 (32 bytes),
; with both arguments marked inreg under the amdgpu_gfx calling convention;
; the previous memory value is returned.
; The checks read the pointer from s4/s5 and the operand from s6/s7. All
; three targets are expected to emit a compare-and-swap retry loop
; (flat_atomic_cmpswap_x2 inside %atomicrmw.start) whose result lands in
; v[0:1], so no register copy follows the loop.
; The GFX7 and GFX8 checks form the +32 and +36 addresses with scalar adds
; before copying them to VGPRs; the GFX9 checks copy the unmodified base
; and put offset:32 on the load and the cmpswap.
define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) {
; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s34, s4, 32
; GFX7-NEXT: s_addc_u32 s35, s5, 0
; GFX7-NEXT: s_add_u32 s36, s4, 36
; GFX7-NEXT: s_addc_u32 s37, s5, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s36
; GFX7-NEXT: v_mov_b32_e32 v1, s37
; GFX7-NEXT: v_mov_b32_e32 v2, s34
; GFX7-NEXT: v_mov_b32_e32 v3, s35
; GFX7-NEXT: flat_load_dword v1, v[0:1]
; GFX7-NEXT: flat_load_dword v0, v[2:3]
; GFX7-NEXT: s_mov_b64 s[38:39], 0
; GFX7-NEXT: v_mov_b32_e32 v4, s7
; GFX7-NEXT: v_mov_b32_e32 v5, s6
; GFX7-NEXT: .LBB148_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v9, v1
; GFX7-NEXT: v_mov_b32_e32 v8, v0
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX7-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9]
; GFX7-NEXT: v_add_i32_e64 v0, s[36:37], -1, v8
; GFX7-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37]
; GFX7-NEXT: s_or_b64 vcc, vcc, s[34:35]
; GFX7-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc
; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX7-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[38:39]
; GFX7-NEXT: s_cbranch_execnz .LBB148_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_add_u32 s34, s4, 32
; GFX8-NEXT: s_addc_u32 s35, s5, 0
; GFX8-NEXT: s_add_u32 s36, s4, 36
; GFX8-NEXT: s_addc_u32 s37, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s36
; GFX8-NEXT: v_mov_b32_e32 v1, s37
; GFX8-NEXT: v_mov_b32_e32 v2, s34
; GFX8-NEXT: v_mov_b32_e32 v3, s35
; GFX8-NEXT: flat_load_dword v1, v[0:1]
; GFX8-NEXT: flat_load_dword v0, v[2:3]
; GFX8-NEXT: s_mov_b64 s[38:39], 0
; GFX8-NEXT: v_mov_b32_e32 v4, s7
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: .LBB148_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v9, v1
; GFX8-NEXT: v_mov_b32_e32 v8, v0
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX8-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9]
; GFX8-NEXT: v_add_u32_e64 v0, s[36:37], -1, v8
; GFX8-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v9, s[36:37]
; GFX8-NEXT: s_or_b64 vcc, vcc, s[34:35]
; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[38:39]
; GFX8-NEXT: s_cbranch_execnz .LBB148_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: s_mov_b64 s[38:39], 0
; GFX9-NEXT: v_mov_b32_e32 v4, s7
; GFX9-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB148_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v9, v1
; GFX9-NEXT: v_mov_b32_e32 v8, v0
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
; GFX9-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[8:9]
; GFX9-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v8
; GFX9-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v9, s[36:37]
; GFX9-NEXT: s_or_b64 vcc, vcc, s[34:35]
; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v5, vcc
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
; GFX9-NEXT: s_cbranch_execnz .LBB148_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[38:39]
; GFX9-NEXT: s_setpc_b64 s[30:31]
  %gep = getelementptr i64, ptr %out, i64 4
  %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1
  ret i64 %result
}
|
|
|
|
; Test: seq_cst `atomicrmw udec_wrap` of an i64 through a flat pointer at
; %out + 4 elements (byte offset 32 in the expected code), with the result
; unused. The instruction carries both !amdgpu.no.remote.memory and
; !noalias.addrspace metadata.
;
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
; As currently recorded, all three run lines expect a compare-and-swap loop
; (%atomicrmw.start .. %atomicrmw.end) built around flat_atomic_cmpswap_x2.
; The GFX7 and GFX8 runs form the address with explicit adds and load the
; initial value as two dwords; the GFX9 run uses an immediate offset:32.
define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v7, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v6, v[8:9]
|
|
; GFX7-NEXT: s_mov_b64 s[8:9], 0
|
|
; GFX7-NEXT: .LBB149_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
|
|
; GFX7-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
|
|
; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], -1, v6
|
|
; GFX7-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v7, s[6:7]
|
|
; GFX7-NEXT: s_or_b64 vcc, vcc, s[4:5]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v5, v1, v3, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB149_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v7, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v6, v[8:9]
|
|
; GFX8-NEXT: s_mov_b64 s[8:9], 0
|
|
; GFX8-NEXT: .LBB149_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
|
|
; GFX8-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
|
|
; GFX8-NEXT: v_add_u32_e64 v0, s[6:7], -1, v6
|
|
; GFX8-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v7, s[6:7]
|
|
; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v5, v1, v3, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, v0
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB149_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32
|
|
; GFX9-NEXT: s_mov_b64 s[8:9], 0
|
|
; GFX9-NEXT: .LBB149_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
|
|
; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
|
|
; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6
|
|
; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7]
|
|
; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB149_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
; Input IR: address the fifth i64 element of %out, then perform the atomic;
; the loaded-back value (%tmp0) is deliberately left unused.
%gep = getelementptr i64, ptr %out, i64 4
|
|
%tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
|
|
ret void
|
|
}
|
|
|
|
; Test: seq_cst `atomicrmw udec_wrap` of an i64 through a flat pointer at
; %out + 4 elements (byte offset 32 in the expected code), returning the
; value produced by the atomic. The instruction carries both
; !amdgpu.no.remote.memory and !noalias.addrspace metadata.
;
; The assertions below are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by hand.
; As currently recorded, all three run lines expect a compare-and-swap loop
; (%atomicrmw.start .. %atomicrmw.end) built around flat_atomic_cmpswap_x2.
; In the GFX7 and GFX8 output the cmpswap writes its result straight into
; v[0:1]; in the GFX9 output the result lives in v[4:5] and is copied to
; v0/v1 after the loop, just before the return.
define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
|
|
; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
|
|
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX7-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX7-NEXT: flat_load_dword v0, v[4:5]
|
|
; GFX7-NEXT: s_mov_b64 s[8:9], 0
|
|
; GFX7-NEXT: .LBB150_1: ; %atomicrmw.start
|
|
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX7-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
|
|
; GFX7-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3]
|
|
; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], -1, v8
|
|
; GFX7-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7]
|
|
; GFX7-NEXT: s_or_b64 vcc, vcc, s[4:5]
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
|
|
; GFX7-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc
|
|
; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: buffer_wbinvl1_vol
|
|
; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
|
|
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
|
|
; GFX7-NEXT: s_cbranch_execnz .LBB150_1
|
|
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
|
|
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
|
|
; GFX8-NEXT: flat_load_dword v1, v[0:1]
|
|
; GFX8-NEXT: flat_load_dword v0, v[4:5]
|
|
; GFX8-NEXT: s_mov_b64 s[8:9], 0
|
|
; GFX8-NEXT: .LBB150_1: ; %atomicrmw.start
|
|
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX8-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
|
|
; GFX8-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[2:3]
|
|
; GFX8-NEXT: v_add_u32_e64 v0, s[6:7], -1, v8
|
|
; GFX8-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v9, s[6:7]
|
|
; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5]
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc
|
|
; GFX8-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc
|
|
; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: buffer_wbinvl1_vol
|
|
; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
|
|
; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
|
|
; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
|
|
; GFX8-NEXT: s_cbranch_execnz .LBB150_1
|
|
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32
|
|
; GFX9-NEXT: s_mov_b64 s[8:9], 0
|
|
; GFX9-NEXT: .LBB150_1: ; %atomicrmw.start
|
|
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, v4
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
|
|
; GFX9-NEXT: v_cmp_gt_u64_e64 s[4:5], v[6:7], v[2:3]
|
|
; GFX9-NEXT: v_add_co_u32_e64 v4, s[6:7], -1, v6
|
|
; GFX9-NEXT: v_addc_co_u32_e64 v5, s[6:7], -1, v7, s[6:7]
|
|
; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5]
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
|
|
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
|
|
; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: buffer_wbinvl1_vol
|
|
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
|
|
; GFX9-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
|
|
; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9]
|
|
; GFX9-NEXT: s_cbranch_execnz .LBB150_1
|
|
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
|
|
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, v4
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
; Input IR: address the fifth i64 element of %out, perform the atomic, and
; return the value it yields.
%gep = getelementptr i64, ptr %out, i64 4
|
|
%result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1
|
|
ret i64 %result
|
|
}
|
|
|
|
; Metadata nodes referenced by the atomicrmw instructions in this file.
; !0 is an empty node; it is used as the operand of !amdgpu.no.remote.memory.
!0 = !{}
|
|
; !1 is the operand of !noalias.addrspace. NOTE(review): presumably a
; half-open address-space range [5, 6), i.e. address space 5 only; confirm
; against the LLVM LangRef before relying on this.
!1 = !{i32 5, i32 6}
|