llvm-project/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
Sameer Sahasrabuddhe a34a024812
[AMDGPU][SIInsertWaitCnts] skip meta instructions early (#145720)
When iterating over a block, meta instructions have no effect on wait counts,
but their presence drops the reference to earlier waitcnt instructions before
they are processed. This results in spurious wait counts, which do not affect
correctness, but are also not required in the resulting program. Skipping meta
instructions as soon as they are seen cleans this up.
2025-07-01 22:02:48 +05:30

21533 lines
970 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx942 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX942-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
; Kernel under test: an `unordered` atomic i32 load from global memory
; (addrspace(1)) written without a syncscope qualifier; the loaded value is
; forwarded to %out through a plain store.
; The assertion groups interleaved with the signature are autogenerated, one
; group per check prefix used by the RUN lines at the top of the file.
; Regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand. In every group the load is expected without any cache-policy
; modifier and no cache-invalidate instruction follows it.
define amdgpu_kernel void @global_system_unordered_load(
; GFX6-LABEL: global_system_unordered_load:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, s9
; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
; GFX6-NEXT: s_mov_b32 s12, 0x100f000
; GFX6-NEXT: s_mov_b32 s13, -1
; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
; GFX6-NEXT: s_mov_b32 s9, s6
; GFX6-NEXT: s_mov_b32 s10, s13
; GFX6-NEXT: s_mov_b32 s11, s12
; GFX6-NEXT: s_mov_b32 s14, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s14
; GFX6-NEXT: s_mov_b32 s6, s13
; GFX6-NEXT: s_mov_b32 s7, s12
; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_unordered_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_unordered_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s9, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s2
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s10, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_unordered_load:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3]
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_unordered_load:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3]
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_unordered_load:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_unordered_load:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
; The atomic load is the operation being checked; the store that follows only
; gives the loaded value a use.
%val = load atomic i32, ptr addrspace(1) %in unordered, align 4
store i32 %val, ptr addrspace(1) %out
ret void
}
; Kernel under test: a `monotonic` atomic i32 load from global memory
; (addrspace(1)) written without a syncscope qualifier; the loaded value is
; forwarded to %out through a plain store.
; The assertion groups interleaved with the signature are autogenerated, one
; group per check prefix used by the RUN lines at the top of the file.
; Regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand. Each group expects the load to carry a cache-policy modifier and
; expects no cache-invalidate instruction after it. The modifier is `glc` for
; gfx6, gfx7, gfx90a, gfx11 and the -amdgcn-skip-cache-invalidations run,
; `glc dlc` for gfx10, `sc0 sc1` for gfx942 and `scope:SCOPE_SYS` for gfx12.
define amdgpu_kernel void @global_system_monotonic_load(
; GFX6-LABEL: global_system_monotonic_load:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, s9
; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
; GFX6-NEXT: s_mov_b32 s12, 0x100f000
; GFX6-NEXT: s_mov_b32 s13, -1
; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
; GFX6-NEXT: s_mov_b32 s9, s6
; GFX6-NEXT: s_mov_b32 s10, s13
; GFX6-NEXT: s_mov_b32 s11, s12
; GFX6-NEXT: s_mov_b32 s14, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s14
; GFX6-NEXT: s_mov_b32 s6, s13
; GFX6-NEXT: s_mov_b32 s7, s12
; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_monotonic_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_monotonic_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s9, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s2
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s10, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_monotonic_load:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_monotonic_load:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_monotonic_load:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_monotonic_load:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
; The atomic load is the operation being checked; the store that follows only
; gives the loaded value a use.
%val = load atomic i32, ptr addrspace(1) %in monotonic, align 4
store i32 %val, ptr addrspace(1) %out
ret void
}
; Kernel under test: an `acquire` atomic i32 load from global memory
; (addrspace(1)) written without a syncscope qualifier; the loaded value is
; forwarded to %out through a plain store.
; The assertion groups interleaved with the signature are autogenerated, one
; group per check prefix used by the RUN lines at the top of the file.
; Regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand. Each group expects the load to carry a cache-policy modifier,
; followed by a wait on the load counter. The modifier is `glc` for gfx6, gfx7,
; gfx90a, gfx11 and the -amdgcn-skip-cache-invalidations run, `glc dlc` for
; gfx10, `sc0 sc1` for gfx942 and `scope:SCOPE_SYS` for gfx12.
; After that wait and before the store, the groups expect these invalidates
;   gfx6          buffer_wbinvl1
;   gfx7          buffer_wbinvl1_vol
;   gfx10, gfx11  buffer_gl1_inv then buffer_gl0_inv
;   gfx90a        buffer_invl2 then buffer_wbinvl1_vol
;   gfx942        buffer_inv sc0 sc1
;   gfx12         global_inv scope:SCOPE_SYS
; The -amdgcn-skip-cache-invalidations run expects no invalidate at all.
define amdgpu_kernel void @global_system_acquire_load(
; GFX6-LABEL: global_system_acquire_load:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, s9
; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
; GFX6-NEXT: s_mov_b32 s12, 0x100f000
; GFX6-NEXT: s_mov_b32 s13, -1
; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
; GFX6-NEXT: s_mov_b32 s9, s6
; GFX6-NEXT: s_mov_b32 s10, s13
; GFX6-NEXT: s_mov_b32 s11, s12
; GFX6-NEXT: s_mov_b32 s14, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s14
; GFX6-NEXT: s_mov_b32 s6, s13
; GFX6-NEXT: s_mov_b32 s7, s12
; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_acquire_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_acquire_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s9, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s2
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s10, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_acquire_load:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_acquire_load:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_acquire_load:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acquire_load:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
; The atomic load is the operation being checked; the store that follows only
; gives the loaded value a use.
%val = load atomic i32, ptr addrspace(1) %in acquire, align 4
store i32 %val, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @global_system_seq_cst_load(
; GFX6-LABEL: global_system_seq_cst_load:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, s9
; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
; GFX6-NEXT: s_mov_b32 s12, 0x100f000
; GFX6-NEXT: s_mov_b32 s13, -1
; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
; GFX6-NEXT: s_mov_b32 s9, s6
; GFX6-NEXT: s_mov_b32 s10, s13
; GFX6-NEXT: s_mov_b32 s11, s12
; GFX6-NEXT: s_mov_b32 s14, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s14
; GFX6-NEXT: s_mov_b32 s6, s13
; GFX6-NEXT: s_mov_b32 s7, s12
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_seq_cst_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s9, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s2
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s10, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_seq_cst_load:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_seq_cst_load:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_seq_cst_load:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_seq_cst_load:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in seq_cst, align 4
store i32 %val, ptr addrspace(1) %out
ret void
}
; Test kernel: unordered atomic i32 store of the kernel argument %in through
; the addrspace(1) pointer %out, with no syncscope given on the store.
; In the expected output below, every check prefix ends with a store
; instruction that carries no cache-policy modifiers, and the only wait
; instruction is the single one that comes after the s_load instructions.
; The assertions are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
define amdgpu_kernel void @global_system_unordered_store(
; GFX6-LABEL: global_system_unordered_store:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_unordered_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_unordered_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_unordered_store:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_unordered_store:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_unordered_store:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_unordered_store:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_unordered_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_unordered_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
; Operation under test: the unordered atomic store.
store atomic i32 %in, ptr addrspace(1) %out unordered, align 4
ret void
}
; Test kernel: monotonic atomic i32 store of the kernel argument %in through
; the addrspace(1) pointer %out, with no syncscope given on the store.
; In the expected output below, the gfx942 prefixes expect "sc0 sc1" on the
; store and the gfx12 prefixes expect "scope:SCOPE_SYS"; all other prefixes
; expect a store without modifiers. No prefix expects any wait other than the
; single one that comes after the s_load instructions.
; The assertions are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
define amdgpu_kernel void @global_system_monotonic_store(
; GFX6-LABEL: global_system_monotonic_store:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_monotonic_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_monotonic_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_monotonic_store:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_monotonic_store:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_monotonic_store:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_monotonic_store:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_monotonic_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_monotonic_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
; Operation under test: the monotonic atomic store.
store atomic i32 %in, ptr addrspace(1) %out monotonic, align 4
ret void
}
; Test kernel: release atomic i32 store of the kernel argument %in through the
; addrspace(1) pointer %out, with no syncscope given on the store.
; In the expected output below, every prefix expects an extra wait directly
; before the store: "s_waitcnt vmcnt(0) lgkmcnt(0)" (followed by
; "s_waitcnt_vscnt null, 0x0" on the gfx10/gfx11 prefixes), or a sequence of
; s_wait_* instructions on the gfx12 prefixes. Ahead of that wait, the
; gfx90a/gfx942 prefixes also expect buffer_wbl2 and the gfx12 prefixes expect
; global_wb. The store itself carries "sc0 sc1" on gfx942 and
; "scope:SCOPE_SYS" on gfx12.
; The assertions are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
define amdgpu_kernel void @global_system_release_store(
; GFX6-LABEL: global_system_release_store:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_release_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_release_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_release_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_release_store:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_release_store:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_release_store:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_release_store:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
; Operation under test: the release atomic store.
store atomic i32 %in, ptr addrspace(1) %out release, align 4
ret void
}
; Test kernel: seq_cst atomic i32 store of the kernel argument %in through the
; addrspace(1) pointer %out, with no syncscope given on the store.
; In the expected output below, every prefix expects an extra wait directly
; before the store: "s_waitcnt vmcnt(0) lgkmcnt(0)" (followed by
; "s_waitcnt_vscnt null, 0x0" on the gfx10/gfx11 prefixes), or a sequence of
; s_wait_* instructions on the gfx12 prefixes. Ahead of that wait, the
; gfx90a/gfx942 prefixes also expect buffer_wbl2 and the gfx12 prefixes expect
; global_wb. The store itself carries "sc0 sc1" on gfx942 and
; "scope:SCOPE_SYS" on gfx12.
; The assertions are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
define amdgpu_kernel void @global_system_seq_cst_store(
; GFX6-LABEL: global_system_seq_cst_store:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_seq_cst_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_seq_cst_store:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_seq_cst_store:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_seq_cst_store:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_seq_cst_store:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_seq_cst_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_seq_cst_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
; Operation under test: the seq_cst atomic store.
store atomic i32 %in, ptr addrspace(1) %out seq_cst, align 4
ret void
}
define amdgpu_kernel void @global_system_monotonic_atomicrmw(
; GFX6-LABEL: global_system_monotonic_atomicrmw:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_monotonic_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_monotonic_atomicrmw:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_monotonic_atomicrmw:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_monotonic_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_monotonic_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_monotonic_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_monotonic_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in monotonic
ret void
}
; Acquire ordering: atomicrmw xchg on a global (addrspace(1)) pointer with no
; syncscope operand; the result %val is never used.
; In every expected sequence below the swap is followed by a wait and then one
; or two cache-invalidate instructions (buffer_wbinvl1[_vol], buffer_gl1_inv +
; buffer_gl0_inv, buffer_invl2, buffer_inv, global_inv). The run that passes
; -amdgcn-skip-cache-invalidations keeps the wait but has no invalidate.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
define amdgpu_kernel void @global_system_acquire_atomicrmw(
; GFX6-LABEL: global_system_acquire_atomicrmw:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_acquire_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_acquire_atomicrmw:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_acquire_atomicrmw:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_acquire_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acquire_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_acquire_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_acquire_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire
ret void
}
; Release ordering: atomicrmw xchg on a global (addrspace(1)) pointer with no
; syncscope operand; the result %val is never used.
; In every expected sequence below the ordering work sits before the swap: one
; or more wait instructions, preceded on gfx90a and gfx942 by buffer_wbl2 and
; on gfx1200 by global_wb. Nothing but s_endpgm follows the swap, so the run
; that passes -amdgcn-skip-cache-invalidations has the same shape as the
; others here.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
define amdgpu_kernel void @global_system_release_atomicrmw(
; GFX6-LABEL: global_system_release_atomicrmw:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_release_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_release_atomicrmw:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_release_atomicrmw:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_release_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_release_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_release_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_release_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in release
ret void
}
; Acquire-release ordering: atomicrmw xchg on a global (addrspace(1)) pointer
; with no syncscope operand; the result %val is never used.
; The expected sequences below have ordering work on both sides of the swap:
;  - before it, one or more wait instructions, preceded on gfx90a and gfx942
;    by buffer_wbl2 and on gfx1200 by global_wb;
;  - after it, a wait followed by one or two cache-invalidate instructions
;    (buffer_wbinvl1[_vol], buffer_gl1_inv + buffer_gl0_inv, buffer_invl2,
;    buffer_inv, global_inv).
; The run that passes -amdgcn-skip-cache-invalidations keeps both waits but
; has no invalidate after the swap.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing them by hand.
define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
; GFX6-LABEL: global_system_acq_rel_atomicrmw:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_acq_rel_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_acq_rel_atomicrmw:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_acq_rel_atomicrmw:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_acq_rel_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acq_rel_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_acq_rel_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_acq_rel_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel
ret void
}
define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
; GFX6-LABEL: global_system_seq_cst_atomicrmw:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_seq_cst_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_seq_cst_atomicrmw:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_seq_cst_atomicrmw:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_seq_cst_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_seq_cst_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_seq_cst_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_seq_cst_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst
ret void
}
; Kernel under test: a volatile `atomicrmw xchg` on a global (addrspace(1))
; pointer with acquire ordering and no explicit syncscope; the value returned
; by the atomic is then stored back through %out.
;
; The autogenerated assertions below pin, per RUN line, the returning atomic
; swap followed by a wait and then instructions such as buffer_wbinvl1,
; buffer_gl1_inv/buffer_gl0_inv, buffer_inv or global_inv ahead of the store
; of the result. The run that passes -amdgcn-skip-cache-invalidations expects
; none of those between the wait and the store.
; Do not hand-edit the assertion lines; regenerate them with
; utils/update_llc_test_checks.py.
define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
; GFX6-LABEL: global_system_acquire_ret_atomicrmw:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_acquire_ret_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_acquire_ret_atomicrmw:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_acquire_ret_atomicrmw:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_acquire_ret_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acquire_ret_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_acquire_ret_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_acquire_ret_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
; Exchange %in into the location %out points at; %val receives the prior contents.
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire
; Store the returned value back through %out, giving the atomic's result a use.
store i32 %val, ptr addrspace(1) %out, align 4
ret void
}
; Kernel under test: a volatile `atomicrmw xchg` on a global (addrspace(1))
; pointer with acq_rel ordering and no explicit syncscope; the value returned
; by the atomic is then stored back through %out.
;
; The autogenerated assertions below pin, per RUN line, wait instructions
; ahead of the atomic (preceded by buffer_wbl2 or global_wb on the targets
; that show one), the returning atomic swap, and then a wait plus
; instructions such as buffer_wbinvl1, buffer_gl1_inv/buffer_gl0_inv,
; buffer_inv or global_inv ahead of the store of the result. The run that
; passes -amdgcn-skip-cache-invalidations expects none of those between the
; post-atomic wait and the store.
; Do not hand-edit the assertion lines; regenerate them with
; utils/update_llc_test_checks.py.
define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
; GFX6-LABEL: global_system_acq_rel_ret_atomicrmw:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_acq_rel_ret_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_acq_rel_ret_atomicrmw:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_acq_rel_ret_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acq_rel_ret_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_acq_rel_ret_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_acq_rel_ret_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
; Exchange %in into the location %out points at; %val receives the prior contents.
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel
; Store the returned value back through %out, giving the atomic's result a use.
store i32 %val, ptr addrspace(1) %out, align 4
ret void
}
; Kernel under test: a volatile `atomicrmw xchg` on a global (addrspace(1))
; pointer with seq_cst ordering and no explicit syncscope; the value returned
; by the atomic is then stored back through %out.
;
; The autogenerated assertions below pin, per RUN line, wait instructions
; ahead of the atomic (preceded by buffer_wbl2 or global_wb on the targets
; that show one), the returning atomic swap, and then a wait plus
; instructions such as buffer_wbinvl1, buffer_gl1_inv/buffer_gl0_inv,
; buffer_inv or global_inv ahead of the store of the result. The run that
; passes -amdgcn-skip-cache-invalidations expects none of those between the
; post-atomic wait and the store.
; Do not hand-edit the assertion lines; regenerate them with
; utils/update_llc_test_checks.py.
define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
; GFX6-LABEL: global_system_seq_cst_ret_atomicrmw:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_seq_cst_ret_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_seq_cst_ret_atomicrmw:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_seq_cst_ret_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_seq_cst_ret_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_seq_cst_ret_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_seq_cst_ret_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
; Exchange %in into the location %out points at; %val receives the prior contents.
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst
; Store the returned value back through %out, giving the atomic's result a use.
store i32 %val, ptr addrspace(1) %out, align 4
ret void
}
; Kernel under test. It performs a volatile i32 cmpxchg with monotonic success
; and monotonic failure ordering, and no syncscope operand, on the addrspace(1)
; pointer %out advanced by four i32 elements. The cmpxchg result is not used.
; In every check block below the cmpswap atomic is followed directly by
; s_endpgm, and the only wait before it is the one that follows the scalar
; loads. The gfx942 runs carry sc1 on the atomic and the gfx12 runs carry
; SCOPE_SYS on it.
define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
; GFX6-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_monotonic_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Index 4 on an i32 pointer is a 16-byte displacement, which appears as the
; constant 16 in the checks above.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; %old is the compare value and %in is the replacement value.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in monotonic monotonic
ret void
}
; Kernel under test. It performs a volatile i32 cmpxchg with acquire success
; ordering and monotonic failure ordering, and no syncscope operand, on the
; addrspace(1) pointer %out advanced by four i32 elements. The cmpxchg result
; is not used.
; The check blocks below expect the cmpswap atomic to be followed by a wait
; (s_waitcnt vmcnt(0), s_waitcnt_vscnt null, 0x0 or s_wait_storecnt 0x0
; depending on the run) and then by cache invalidate instructions before
; s_endpgm. The amdpal run that passes -amdgcn-skip-cache-invalidations keeps
; the wait but expects no invalidate after it.
define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX6-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_acquire_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Index 4 on an i32 pointer is a 16-byte displacement, which appears as the
; constant 16 in the checks above.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; %old is the compare value and %in is the replacement value.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acquire monotonic
ret void
}
; Kernel under test. It performs a volatile i32 cmpxchg with release success
; ordering and monotonic failure ordering, and no syncscope operand, on the
; addrspace(1) pointer %out advanced by four i32 elements. The cmpxchg result
; is not used.
; The check blocks below expect a wait placed immediately before the cmpswap
; atomic. That wait is s_waitcnt vmcnt(0) lgkmcnt(0), with s_waitcnt_vscnt
; null, 0x0 added on gfx10 and gfx11, and a run of s_wait_* instructions on
; gfx12. The gfx90a, gfx942 and gfx12 runs additionally expect a writeback
; (buffer_wbl2 or global_wb) ahead of that wait. Nothing is expected between
; the atomic and s_endpgm.
define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
; GFX6-LABEL: global_system_release_monotonic_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_release_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_release_monotonic_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_release_monotonic_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_release_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_release_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Index 4 on an i32 pointer is a 16-byte displacement, which appears as the
; constant 16 in the checks above.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; %old is the compare value and %in is the replacement value.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in release monotonic
ret void
}
; Kernel under test: a volatile cmpxchg on a global (addrspace(1)) pointer with
; success ordering acq_rel and failure ordering monotonic, written without a
; syncscope operand. The assertions between the "define" line and the parameter
; list are autogenerated (see the NOTE at the top of this file); regenerate them
; with utils/update_llc_test_checks.py rather than editing them by hand.
; In the expected output every run has a wait before and after the atomic, and
; every run except the -amdgcn-skip-cache-invalidations one also expects an
; invalidate instruction after it.
define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX6-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Address of i32 element 4 past %out; this is the 16-byte offset that shows up
; in the expected instructions above.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare against %old and store %in on a match. %val is never read; the kernel
; simply returns afterwards.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acq_rel monotonic
ret void
}
; Kernel under test: a volatile cmpxchg on a global (addrspace(1)) pointer with
; success ordering seq_cst and failure ordering monotonic, written without a
; syncscope operand. The assertions between the "define" line and the parameter
; list are autogenerated (see the NOTE at the top of this file); regenerate them
; with utils/update_llc_test_checks.py rather than editing them by hand.
; In the expected output every run has a wait before and after the atomic, and
; every run except the -amdgcn-skip-cache-invalidations one also expects an
; invalidate instruction after it.
define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX6-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Address of i32 element 4 past %out; this is the 16-byte offset that shows up
; in the expected instructions above.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare against %old and store %in on a match. %val is never read; the kernel
; simply returns afterwards.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst monotonic
ret void
}
define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX6-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_monotonic_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in monotonic acquire
ret void
}
; Kernel test: volatile cmpxchg on a global (addrspace(1)) pointer with
; acquire success ordering and acquire failure ordering. The instruction
; carries no syncscope, which in LLVM IR is the default (system) scope.
; Arguments: %out is the base pointer, %in is the new value to store and
; %old is the value compared against.
; The autogenerated checks below expect, on every target, the cmpswap at byte
; offset 16 followed by a wait, and then cache-invalidate instruction(s)
; (for example buffer_wbinvl1, buffer_gl0_inv or global_inv). The run that
; passes -amdgcn-skip-cache-invalidations expects the wait but no invalidate.
define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX6-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_acquire_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Address of element 4 of %out; with i32 elements this is the byte offset 16
; that appears in the expected cmpswap instructions above.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Success ordering acquire, failure ordering acquire. The result %val is not
; used; the function returns void.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acquire acquire
ret void
}
; Kernel test: volatile cmpxchg on a global (addrspace(1)) pointer with
; release success ordering and acquire failure ordering. The instruction
; carries no syncscope, which in LLVM IR is the default (system) scope.
; Arguments: %out is the base pointer, %in is the new value to store and
; %old is the value compared against.
; The autogenerated checks below expect a wait on outstanding memory
; operations immediately before the cmpswap (preceded by a write-back such as
; buffer_wbl2 or global_wb on the gfx90a, gfx942 and gfx1200 runs), and after
; the cmpswap a wait followed by cache-invalidate instruction(s). The run that
; passes -amdgcn-skip-cache-invalidations expects both waits but no invalidate.
define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX6-LABEL: global_system_release_acquire_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_release_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_release_acquire_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_release_acquire_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_release_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_release_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Address of element 4 of %out; with i32 elements this is the byte offset 16
; that appears in the expected cmpswap instructions above.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Success ordering release, failure ordering acquire. The result %val is not
; used; the function returns void.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in release acquire
ret void
}
define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX6-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_acq_rel_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acq_rel acquire
ret void
}
; Kernel under test: a volatile 32-bit cmpxchg on an addrspace(1) pointer with
; success ordering seq_cst and failure ordering acquire. No syncscope is
; given on the instruction. The cmpxchg result is unused and the kernel
; returns void.
; The per-target assertions inside the definition are autogenerated (see the
; NOTE at the top of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX6-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_seq_cst_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Address of i32 element 4 of %out, i.e. 16 bytes past the base; this is the
; constant 16 that appears in the expected instruction sequences above.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare the word at %gep with %old and store %in on a match. %val is
; deliberately left unused.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst acquire
ret void
}
; Kernel under test: a volatile 32-bit cmpxchg on an addrspace(1) pointer with
; seq_cst as both the success and the failure ordering. No syncscope is given
; on the instruction. The cmpxchg result is unused and the kernel returns
; void.
; The per-target assertions inside the definition are autogenerated (see the
; NOTE at the top of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX6-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Address of i32 element 4 of %out, i.e. 16 bytes past the base; this is the
; constant 16 that appears in the expected instruction sequences above.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare the word at %gep with %old and store %in on a match. %val is
; deliberately left unused.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst seq_cst
ret void
}
define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
; GFX6-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in monotonic monotonic
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
; Returning cmpxchg through an addrspace(1) pointer with acquire success
; ordering, monotonic failure ordering and no explicit syncscope. In the
; expected output below, every run places a wait on the atomic's result
; (s_waitcnt vmcnt(0) or s_wait_loadcnt 0x0) directly after the atomic,
; followed by that target's cache invalidate instruction(s). The run that
; passes -amdgcn-skip-cache-invalidations expects the wait but no invalidate.
define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
; GFX6-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; The atomic targets the fifth i32 past %out (index 4, i.e. byte offset 16,
; which is the offset:16 / 64-bit add of 16 seen in the expected output).
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare against %old and swap in %in; orderings are acquire on success and
; monotonic on failure.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acquire monotonic
; Field 0 of the { i32, i1 } result is the value that was loaded; the i1
; success flag is unused.
%val0 = extractvalue { i32, i1 } %val, 0
; Storing the loaded value to %out keeps the atomic's return value live, so
; the returning form of the instruction is what gets checked.
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
; GFX6-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acq_rel monotonic
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
; Returning cmpxchg on a global (addrspace(1)) address with seq_cst success and
; monotonic failure ordering; no syncscope is given, so the default scope
; applies. The loaded value is stored back to %out.
; In the expected output below, every configuration waits on memory counters
; both before and after the atomic. The gfx90a, gfx942 and gfx12 blocks also
; issue a cache write-back before it, and every block except the one for the
; -amdgcn-skip-cache-invalidations invocation issues a cache invalidate after it.
define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
; GFX6-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Address of i32 element 4 of %out, i.e. 16 bytes past %out; this is the
; offset:16 (or the added constant 16) seen in the expected output above.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare the i32 at %gep with %old and write %in on a match.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst monotonic
; Field 0 of the { i32, i1 } result is the value that was loaded; using it is
; what makes this the returning form of the atomic.
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
; Returning cmpxchg on a global (addrspace(1)) address with monotonic success
; and acquire failure ordering; no syncscope is given, so the default scope
; applies. The loaded value is stored back to %out.
; In the expected output below, no wait or cache write-back is emitted between
; the operand setup and the atomic. After the atomic, each configuration waits
; on the load counter (vmcnt / loadcnt), and every block except the one for the
; -amdgcn-skip-cache-invalidations invocation then issues a cache invalidate.
define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
; GFX6-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Address of i32 element 4 of %out, i.e. 16 bytes past %out; this is the
; offset:16 (or the added constant 16) seen in the expected output above.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare the i32 at %gep with %old and write %in on a match.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in monotonic acquire
; Field 0 of the { i32, i1 } result is the value that was loaded; using it is
; what makes this the returning form of the atomic.
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
; GFX6-LABEL: global_system_acquire_acquire_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_acquire_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_acquire_acquire_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acquire acquire
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
; Kernel under test: a volatile cmpxchg on global memory (addrspace(1)) with
; success ordering `release` and failure ordering `acquire`, and no syncscope
; given, whose returned (old) value is stored back to %out.
;
; The expected-output lines that follow the signature are produced by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand. For every llc invocation they expect a wait (plus a
; cache write-back in the gfx90a, gfx942 and gfx12 expectations) ahead of the
; compare-and-swap and a vmcnt/loadcnt wait after it, followed by a cache
; invalidate in every case except the invocation that passes
; -amdgcn-skip-cache-invalidations.
define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
; GFX6-LABEL: global_system_release_acquire_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_release_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_release_acquire_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_release_acquire_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_release_acquire_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Address of i32 element 4 of %out; this is the byte offset 16 that shows up
; in the expected cmpswap instructions.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare against %old and, on a match, write %in. Success ordering is
; release, failure ordering is acquire; no syncscope is specified.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in release acquire
; Field 0 of the { i32, i1 } result is the value that was read from memory.
%val0 = extractvalue { i32, i1 } %val, 0
; Consume the returned value so the returning form of the atomic is exercised.
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
; GFX6-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acq_rel acquire
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
; Returning cmpxchg on a global (addrspace(1)) pointer with success ordering
; seq_cst and failure ordering acquire. No syncscope is written on the
; instruction, so the default scope applies. The value read by the cmpxchg is
; stored back to %out, which keeps the returned result live.
define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
; GFX6-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Element index 4 of an i32 array is a 16-byte offset; it shows up in the
; expected output as offset:16 or as an explicit add of 16 to the base address.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare against %old, swap in %in; success ordering seq_cst, failure acquire.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst acquire
; Element 0 of the { i32, i1 } result is the value that was read from memory.
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
; Returning cmpxchg on a global (addrspace(1)) pointer with success ordering
; monotonic and failure ordering seq_cst. No syncscope is written on the
; instruction, so the default scope applies. The value read by the cmpxchg is
; stored back to %out, which keeps the returned result live.
define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
; GFX6-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Element index 4 of an i32 array is a 16-byte offset; it shows up in the
; expected output as offset:16 or as an explicit add of 16 to the base address.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare against %old, swap in %in; success ordering monotonic, failure seq_cst.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in monotonic seq_cst
; Element 0 of the { i32, i1 } result is the value that was read from memory.
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
; Returning cmpxchg on a global (addrspace(1)) pointer with no syncscope
; operand, success ordering acquire and failure ordering seq_cst. The value
; loaded by the cmpxchg is written back to %out.
; The per-target assertions below are autogenerated (see the note at the top of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
; GFX6-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; The cmpxchg targets %out advanced by 4 i32 elements. The expected code
; encodes this as offset:16 on the atomic, except in the flat_atomic_cmpswap
; variant, which adds 16 to the base with s_add_u32/s_addc_u32.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare against %old and swap in %in; orderings are acquire on success and
; seq_cst on failure.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acquire seq_cst
; Only the loaded value (field 0 of the { i32, i1 } result) is used; the
; success flag is ignored.
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
; Returning cmpxchg on a global (addrspace(1)) pointer with no syncscope
; operand, success ordering release and failure ordering seq_cst. The value
; loaded by the cmpxchg is written back to %out.
; The function name spells the ordering "relese" while the IR below uses the
; keyword release. The same spelling is repeated in every label assertion, so a
; rename must update all of them together (or the assertions must be
; regenerated with utils/update_llc_test_checks.py).
define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
; GFX6-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; The cmpxchg targets %out advanced by 4 i32 elements. The expected code
; encodes this as offset:16 on the atomic, except in the flat_atomic_cmpswap
; variant, which adds 16 to the base with s_add_u32/s_addc_u32.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare against %old and swap in %in; orderings are release on success and
; seq_cst on failure.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in release seq_cst
; Only the loaded value (field 0 of the { i32, i1 } result) is used; the
; success flag is ignored.
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX6-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in acq_rel seq_cst
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
; Returning cmpxchg on a global (addrspace(1)) pointer with seq_cst success
; and seq_cst failure orderings. No syncscope is given on the instruction.
;   %out - global pointer; the exchange targets element 4 (i32) past %out,
;          and the returned value is stored to %out itself.
;   %in  - replacement value written when the comparison succeeds.
;   %old - expected value the memory contents are compared against.
; The assertion lines below are autogenerated (see the NOTE at the top of
; this file); regenerate them with utils/update_llc_test_checks.py rather
; than editing them by hand.
define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX6-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Index 4 of i32 elements, i.e. 16 bytes past %out; the expected atomics
; above address the same location (offset:16 or an explicit add of 16).
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Volatile compare-and-swap at %gep: %old is the comparand, %in the new value.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in seq_cst seq_cst
; Only field 0 (the i32 member) of the { i32, i1 } result is used; the i1
; field is never read in this test.
%val0 = extractvalue { i32, i1 } %val, 0
; The extracted value is written to the base pointer %out, not to %gep.
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
; Atomic i32 load from a global (addrspace(1)) pointer with
; syncscope("one-as") and unordered ordering, followed by a plain store of
; the loaded value.
;   %in  - global pointer that is loaded from.
;   %out - global pointer the loaded value is stored to.
; Every expected sequence below is a load without any cache-policy modifier,
; a wait for that load, and a store; none contains a cache invalidate or
; write-back instruction.
; The assertion lines are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
define amdgpu_kernel void @global_system_one_as_unordered_load(
; GFX6-LABEL: global_system_one_as_unordered_load:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, s9
; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
; GFX6-NEXT: s_mov_b32 s12, 0x100f000
; GFX6-NEXT: s_mov_b32 s13, -1
; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
; GFX6-NEXT: s_mov_b32 s9, s6
; GFX6-NEXT: s_mov_b32 s10, s13
; GFX6-NEXT: s_mov_b32 s11, s12
; GFX6-NEXT: s_mov_b32 s14, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s14
; GFX6-NEXT: s_mov_b32 s6, s13
; GFX6-NEXT: s_mov_b32 s7, s12
; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_load_dword v2, v[0:1]
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_unordered_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s9, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s2
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s10, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_unordered_load:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3]
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_unordered_load:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3]
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_unordered_load:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_unordered_load:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_unordered_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_unordered_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
; The atomic operation under test: unordered ordering, "one-as" sync scope.
%val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") unordered, align 4
; Ordinary (non-atomic) store of the loaded value to %out.
store i32 %val, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @global_system_one_as_monotonic_load(
; GFX6-LABEL: global_system_one_as_monotonic_load:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, s9
; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
; GFX6-NEXT: s_mov_b32 s12, 0x100f000
; GFX6-NEXT: s_mov_b32 s13, -1
; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
; GFX6-NEXT: s_mov_b32 s9, s6
; GFX6-NEXT: s_mov_b32 s10, s13
; GFX6-NEXT: s_mov_b32 s11, s12
; GFX6-NEXT: s_mov_b32 s14, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s14
; GFX6-NEXT: s_mov_b32 s6, s13
; GFX6-NEXT: s_mov_b32 s7, s12
; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s9, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s2
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s10, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_load:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_monotonic_load:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_monotonic_load:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_monotonic_load:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_monotonic_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_monotonic_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") monotonic, align 4
store i32 %val, ptr addrspace(1) %out
ret void
}
; Acquire atomic load at syncscope("one-as") from global memory (addrspace 1).
; The i32 read from %in is then written to %out with an ordinary store.
; In the assertions below, every configuration waits for the load to return
; and then issues its cache-invalidate instruction(s) before the store. The
; exception is the run that passes -amdgcn-skip-cache-invalidations, which
; keeps the wait but has no invalidate.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @global_system_one_as_acquire_load(
; GFX6-LABEL: global_system_one_as_acquire_load:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, s9
; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
; GFX6-NEXT: s_mov_b32 s12, 0x100f000
; GFX6-NEXT: s_mov_b32 s13, -1
; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
; GFX6-NEXT: s_mov_b32 s9, s6
; GFX6-NEXT: s_mov_b32 s10, s13
; GFX6-NEXT: s_mov_b32 s11, s12
; GFX6-NEXT: s_mov_b32 s14, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s14
; GFX6-NEXT: s_mov_b32 s6, s13
; GFX6-NEXT: s_mov_b32 s7, s12
; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s9, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s2
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s10, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acquire_load:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_acquire_load:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_acquire_load:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acquire_load:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_acquire_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_acquire_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
; Operation under test: the acquire atomic load.
%val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") acquire, align 4
; Non-atomic store of the loaded value.
store i32 %val, ptr addrspace(1) %out
ret void
}
; Sequentially consistent atomic load at syncscope("one-as") from global
; memory (addrspace 1). The i32 read from %in is then written to %out with an
; ordinary store.
; In the assertions below, every configuration has a wait on the vector-memory
; counter(s) immediately before the load, then a wait after the load, and then
; its cache-invalidate instruction(s) before the store. The run that passes
; -amdgcn-skip-cache-invalidations keeps both waits but has no invalidate.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @global_system_one_as_seq_cst_load(
; GFX6-LABEL: global_system_one_as_seq_cst_load:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, s9
; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
; GFX6-NEXT: s_mov_b32 s12, 0x100f000
; GFX6-NEXT: s_mov_b32 s13, -1
; GFX6-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11
; GFX6-NEXT: s_mov_b32 s9, s6
; GFX6-NEXT: s_mov_b32 s10, s13
; GFX6-NEXT: s_mov_b32 s11, s12
; GFX6-NEXT: s_mov_b32 s14, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s14
; GFX6-NEXT: s_mov_b32 s6, s13
; GFX6-NEXT: s_mov_b32 s7, s12
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_load_dword v2, v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_load:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s5
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s9, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s2
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s10, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s10
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s9
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s8
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_load:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_seq_cst_load:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: global_load_dword v1, v0, s[2:3] sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_seq_cst_load:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[2:3] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_seq_cst_load:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_load_b32 v1, v0, s[2:3] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_load:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_seq_cst_load:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %in, ptr addrspace(1) %out) {
entry:
; Operation under test: the seq_cst atomic load.
%val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") seq_cst, align 4
; Non-atomic store of the loaded value.
store i32 %val, ptr addrspace(1) %out
ret void
}
; Unordered atomic store at syncscope("one-as") of the i32 kernel argument %in
; to global memory (addrspace 1) at %out.
; In the assertions below, every configuration emits a single store
; instruction with no cache-policy or scope modifier, and no extra waits or
; cache maintenance around it.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @global_system_one_as_unordered_store(
; GFX6-LABEL: global_system_one_as_unordered_store:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_unordered_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_unordered_store:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_unordered_store:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_unordered_store:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_unordered_store:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_unordered_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_unordered_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
; Operation under test: the unordered atomic store.
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") unordered, align 4
ret void
}
; Monotonic atomic store at syncscope("one-as") of the i32 kernel argument %in
; to global memory (addrspace 1) at %out.
; In the assertions below, the gfx942 runs expect "sc0 sc1" on the store and
; the gfx12 runs expect "scope:SCOPE_SYS"; the other configurations expect a
; store with no modifier. No configuration has extra waits or cache
; maintenance around the store.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @global_system_one_as_monotonic_store(
; GFX6-LABEL: global_system_one_as_monotonic_store:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_store:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_monotonic_store:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_monotonic_store:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_monotonic_store:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_monotonic_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_monotonic_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
; Operation under test: the monotonic atomic store.
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") monotonic, align 4
ret void
}
; Atomic release store of %in to the global (addrspace 1) pointer %out at
; syncscope "one-as". In the assertions below every configuration expects a
; wait immediately ahead of the store (s_waitcnt vmcnt(0), joined by
; s_waitcnt_vscnt on gfx10/gfx11, or the s_wait_* sequence on gfx12), and the
; gfx90a, gfx942 and gfx12 runs additionally expect a write-back
; (buffer_wbl2 / global_wb) in front of that wait.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
define amdgpu_kernel void @global_system_one_as_release_store(
; GFX6-LABEL: global_system_one_as_release_store:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_release_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_release_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_release_store:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_release_store:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_release_store:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_release_store:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_release_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_release_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
; The operation under test; nothing else in the kernel touches memory.
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") release, align 4
ret void
}
; Atomic seq_cst store of %in to the global (addrspace 1) pointer %out at
; syncscope "one-as". In the assertions below every configuration expects a
; wait immediately ahead of the store (s_waitcnt vmcnt(0), joined by
; s_waitcnt_vscnt on gfx10/gfx11, or the s_wait_* sequence on gfx12), and the
; gfx90a, gfx942 and gfx12 runs additionally expect a write-back
; (buffer_wbl2 / global_wb) in front of that wait. No instruction is expected
; between the store and s_endpgm.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
define amdgpu_kernel void @global_system_one_as_seq_cst_store(
; GFX6-LABEL: global_system_one_as_seq_cst_store:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_store:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_store:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_seq_cst_store:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_seq_cst_store:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_seq_cst_store:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_store:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_seq_cst_store:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
i32 %in, ptr addrspace(1) %out) {
entry:
; The operation under test; nothing else in the kernel touches memory.
store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") seq_cst, align 4
ret void
}
; Volatile monotonic atomicrmw xchg of %in into the global (addrspace 1)
; pointer %out at syncscope "one-as". In the assertions below the only wait
; expected is the one following the two s_load instructions; the atomic swap
; is followed directly by s_endpgm with no write-back or invalidate
; instruction. The gfx942 runs expect the sc1 modifier on the swap and the
; gfx12 runs expect scope:SCOPE_SYS; the other runs expect no modifier.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
; GFX6-LABEL: global_system_one_as_monotonic_atomicrmw:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_monotonic_atomicrmw:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_monotonic_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_monotonic_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_monotonic_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_monotonic_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
; The operation under test; the exchanged-out value %val is never used.
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") monotonic
ret void
}
; Volatile acquire atomicrmw xchg of %in into the global (addrspace 1) pointer
; %out at syncscope "one-as". In the assertions below the atomic swap is
; followed by a wait (s_waitcnt vmcnt(0); s_waitcnt_vscnt on gfx10/gfx11;
; s_wait_storecnt on gfx12) and then by invalidate instructions --
; buffer_wbinvl1 (gfx600), buffer_wbinvl1_vol (gfx700),
; buffer_gl1_inv + buffer_gl0_inv (gfx10/gfx11),
; buffer_invl2 + buffer_wbinvl1_vol (gfx90a), buffer_inv sc0 sc1 (gfx942),
; global_inv scope:SCOPE_SYS (gfx12). The run that passes
; -amdgcn-skip-cache-invalidations expects only the wait.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
; GFX6-LABEL: global_system_one_as_acquire_atomicrmw:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_acquire_atomicrmw:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_acquire_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acquire_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_acquire_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_acquire_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
; The operation under test; the exchanged-out value %val is never used.
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire
ret void
}
; Kernel under test: a volatile atomicrmw xchg on a global (addrspace(1))
; pointer with syncscope("one-as") and release ordering. The value returned by
; the exchange is not used.
;
; The autogenerated expectations below pin what each llc configuration emits
; ahead of the atomic: s_waitcnt vmcnt(0) (plus s_waitcnt_vscnt on
; gfx10/gfx11, or the s_wait_*cnt sequence on gfx12) and, on
; gfx90a/gfx942/gfx12, a preceding buffer_wbl2 / global_wb. In every
; configuration the atomic is immediately followed by s_endpgm, i.e. no wait
; or cache invalidate is expected after it.
define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
; GFX6-LABEL: global_system_one_as_release_atomicrmw:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_release_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_release_atomicrmw:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_release_atomicrmw:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_release_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_release_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_release_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_release_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
; The operation under test; %val is intentionally left unused, so the
; expectations above use the atomic without a destination register.
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") release
ret void
}
; Kernel under test: a volatile atomicrmw xchg on a global (addrspace(1))
; pointer with syncscope("one-as") and acq_rel ordering. The value returned by
; the exchange is not used.
;
; The autogenerated expectations below pin synchronization on both sides of
; the atomic:
;   - before it: s_waitcnt vmcnt(0) (plus s_waitcnt_vscnt on gfx10/gfx11, or
;     the s_wait_*cnt sequence on gfx12), preceded by buffer_wbl2 / global_wb
;     on gfx90a/gfx942/gfx12;
;   - after it: a further wait (s_waitcnt vmcnt(0), s_waitcnt_vscnt, or
;     s_wait_storecnt) followed by the target's cache invalidate
;     (buffer_wbinvl1, buffer_wbinvl1_vol, buffer_gl1_inv + buffer_gl0_inv,
;     buffer_invl2 + buffer_wbinvl1_vol, buffer_inv sc0 sc1, or global_inv).
; The configuration run with -amdgcn-skip-cache-invalidations keeps the
; trailing wait but expects no invalidate instruction.
define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
; GFX6-LABEL: global_system_one_as_acq_rel_atomicrmw:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_acq_rel_atomicrmw:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_acq_rel_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acq_rel_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_acq_rel_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_acq_rel_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
; The operation under test; %val is intentionally left unused, so the
; expectations above use the atomic without a destination register.
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel
ret void
}
; Kernel under test: a volatile atomicrmw xchg on a global (addrspace(1))
; pointer with syncscope("one-as") and seq_cst ordering. The value returned by
; the exchange is not used.
;
; The autogenerated expectations below pin synchronization on both sides of
; the atomic:
;   - before it: s_waitcnt vmcnt(0) (plus s_waitcnt_vscnt on gfx10/gfx11, or
;     the s_wait_*cnt sequence on gfx12), preceded by buffer_wbl2 / global_wb
;     on gfx90a/gfx942/gfx12;
;   - after it: a further wait (s_waitcnt vmcnt(0), s_waitcnt_vscnt, or
;     s_wait_storecnt) followed by the target's cache invalidate
;     (buffer_wbinvl1, buffer_wbinvl1_vol, buffer_gl1_inv + buffer_gl0_inv,
;     buffer_invl2 + buffer_wbinvl1_vol, buffer_inv sc0 sc1, or global_inv).
; The configuration run with -amdgcn-skip-cache-invalidations keeps the
; trailing wait but expects no invalidate instruction.
define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
; GFX6-LABEL: global_system_one_as_seq_cst_atomicrmw:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_swap v[0:1], v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_seq_cst_atomicrmw:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[0:1] sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_seq_cst_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_seq_cst_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_seq_cst_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
; The operation under test; %val is intentionally left unused, so the
; expectations above use the atomic without a destination register.
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst
ret void
}
; Kernel under test: a volatile atomicrmw xchg on a global (addrspace(1))
; pointer with syncscope("one-as") and acquire ordering whose result is used:
; the exchanged-out value is stored back to %out.
;
; The autogenerated expectations below pin, for each llc configuration:
;   - no wait between the argument setup and the atomic (only the
;     lgkmcnt / kmcnt wait for the kernel-argument loads precedes it);
;   - an atomic with a destination register, carrying glc, sc0 sc1, or
;     th:TH_ATOMIC_RETURN depending on the target (presumably the modifier
;     that selects the value-returning form -- confirm against the ISA docs);
;   - after the atomic: s_waitcnt vmcnt(0) (s_wait_loadcnt 0x0 on gfx12),
;     then the target's cache invalidate, then the store of the result.
; The configuration run with -amdgcn-skip-cache-invalidations expects the
; wait to be followed directly by the store, with no invalidate instruction.
define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
; GFX6-LABEL: global_system_one_as_acquire_ret_atomicrmw:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_ret_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_acquire_ret_atomicrmw:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_acquire_ret_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_acquire_ret_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
; The operation under test. Unlike the non-"ret" kernels, %val is consumed by
; the store below, so the expectations above use an atomic with a destination
; register.
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire
; Keep the exchanged-out value live by writing it back through %out.
store i32 %val, ptr addrspace(1) %out, align 4
ret void
}
; Returning atomicrmw xchg on global memory (addrspace 1) with acq_rel
; ordering and syncscope("one-as"). The value returned by the atomic is
; stored back to %out, so the result of the atomic is used.
; In the expected output below, every target waits on outstanding vector
; memory operations before the atomic (the gfx90a, gfx942 and gfx12 runs also
; expect a write-back first, buffer_wbl2 or global_wb) and waits again after
; it, followed by a cache invalidate. The run that passes
; -amdgcn-skip-cache-invalidations expects the waits but no invalidate.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
; GFX6-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
; Exchange %in into the location %out points to; %val is the value returned
; by the atomic.
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel
; Use the returned value by writing it back through %out.
store i32 %val, ptr addrspace(1) %out, align 4
ret void
}
; Returning atomicrmw xchg on global memory (addrspace 1) with seq_cst
; ordering and syncscope("one-as"). The value returned by the atomic is
; stored back to %out, so the result of the atomic is used.
; In the expected output below, every target waits on outstanding vector
; memory operations before the atomic (the gfx90a, gfx942 and gfx12 runs also
; expect a write-back first, buffer_wbl2 or global_wb) and waits again after
; it, followed by a cache invalidate. The run that passes
; -amdgcn-skip-cache-invalidations expects the waits but no invalidate.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
; GFX6-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX6-NEXT: s_load_dword s8, s[8:9], 0x2
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s11, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s9, 0x100f000
; GFX6-NEXT: s_mov_b32 s10, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s11
; GFX6-NEXT: s_mov_b32 s6, s10
; GFX6-NEXT: s_mov_b32 s7, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s6
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[4:5], 0x2
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s6
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_swap v1, v0, v1, s[0:1] sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in) {
entry:
; Exchange %in into the location %out points to; %val is the value returned
; by the atomic.
%val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst
; Use the returned value by writing it back through %out.
store i32 %val, ptr addrspace(1) %out, align 4
ret void
}
; cmpxchg on global memory (addrspace 1) with monotonic success and failure
; orderings and syncscope("one-as"). The operation targets the address
; %out + 4 i32 elements, which shows up in the expected output as offset:16
; (or as an explicit 64-bit add of 16 in the gfx700 flat-addressing run).
; The result %val is not used.
; In the expected output below, the atomic is emitted on its own: the only
; wait before it is the one for the scalar argument loads, and no wait or
; cache invalidate follows it.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
; GFX6-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Address of the fifth i32 starting at %out (16 bytes past %out).
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare the value at %gep with %old and, on a match, replace it with %in.
; The result pair is left unused.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic
ret void
}
define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX6-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic
ret void
}
; Kernel under test: a volatile cmpxchg on a global (addrspace(1)) pointer with
; syncscope("one-as"), success ordering release and failure ordering monotonic.
; The access targets %out + 4 x i32, which shows up in the expected output as
; "offset:16" (or as a 16-byte scalar add on the target that uses a flat
; atomic). The cmpxchg result %val is never used.
;
; In every expected sequence below a wait on outstanding memory operations is
; placed directly before the atomic and the atomic is followed only by
; s_endpgm, i.e. no cache invalidate is expected. The gfx90a and gfx942
; sequences additionally start with buffer_wbl2, and the gfx1200 sequences
; with global_wb scope:SCOPE_SYS.
;
; The assertion lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
; GFX6-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare against %old and store %in on a match; release applies on success,
; monotonic on failure. The returned pair is intentionally discarded.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") release monotonic
ret void
}
; Kernel under test: a volatile cmpxchg on a global (addrspace(1)) pointer with
; syncscope("one-as"), success ordering acq_rel and failure ordering monotonic.
; The access targets %out + 4 x i32, which shows up in the expected output as
; "offset:16" (or as a 16-byte scalar add on the target that uses a flat
; atomic). The cmpxchg result %val is never used.
;
; Expected shape of every sequence below:
;   * before the atomic: a wait on outstanding memory operations; the gfx90a
;     and gfx942 sequences first emit buffer_wbl2 and the gfx1200 sequences
;     first emit global_wb scope:SCOPE_SYS;
;   * after the atomic: another wait, then a cache invalidate (buffer_wbinvl1,
;     buffer_wbinvl1_vol, buffer_gl1_inv + buffer_gl0_inv, buffer_invl2 +
;     buffer_wbinvl1_vol, buffer_inv sc0 sc1, or global_inv scope:SCOPE_SYS,
;     depending on the target).
; The run that passes -amdgcn-skip-cache-invalidations keeps the trailing wait
; but has no invalidate instruction.
;
; The assertion lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX6-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare against %old and store %in on a match; acq_rel applies on success,
; monotonic on failure. The returned pair is intentionally discarded.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic
ret void
}
; Volatile cmpxchg on a global (addrspace(1)) pointer with syncscope("one-as"),
; success ordering seq_cst and failure ordering monotonic; the result is unused.
; The assertions below expect a wait before the atomic (preceded by a cache
; write-back on GFX90A, GFX942 and GFX12), then a wait and a cache invalidation
; after it. SKIP-CACHE-INV expects both waits but no invalidation.
; The assertions are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX6-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Element index 4 of an i32 array is byte offset 16, which shows up as
; offset:16 (or the constant 16 added to the base on GFX7) in the assertions.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare against %old, store %in on success; orderings are <success> <failure>.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic
ret void
}
; Volatile cmpxchg on a global (addrspace(1)) pointer with syncscope("one-as"),
; success ordering monotonic and failure ordering acquire; the result is unused.
; The assertions below expect no wait before the atomic, then a wait and a
; cache invalidation after it. SKIP-CACHE-INV expects the wait but no
; invalidation.
; The assertions are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX6-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Element index 4 of an i32 array is byte offset 16, which shows up as
; offset:16 (or the constant 16 added to the base on GFX7) in the assertions.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare against %old, store %in on success; orderings are <success> <failure>.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire
ret void
}
; Volatile cmpxchg on a global (addrspace(1)) pointer with syncscope("one-as"),
; success ordering acquire and failure ordering acquire; the result is unused.
; The assertions below expect no wait before the atomic, then a wait and a
; cache invalidation after it. SKIP-CACHE-INV expects the wait but no
; invalidation.
; The assertions are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX6-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Element index 4 of an i32 array is byte offset 16, which shows up as
; offset:16 (or the constant 16 added to the base on GFX7) in the assertions.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare against %old, store %in on success; orderings are <success> <failure>.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire
ret void
}
; Test kernel: a volatile cmpxchg on global memory (addrspace(1)) using
; syncscope("one-as") with success ordering `release` and failure ordering
; `acquire`. The access targets %out advanced by 4 i32 elements (16 bytes) and
; the cmpxchg result is never read.
;
; The per-target assertions that follow the `define` line are autogenerated by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand. In every configuration a wait-count instruction is
; expected both before and after the atomic. A cache invalidate is expected
; after the trailing wait in every configuration except the one that passes
; -amdgcn-skip-cache-invalidations, which expects the kernel to end right after
; the wait.
define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX6-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_release_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Advance %out by 4 i32 elements, i.e. 16 bytes; the expected code shows this as
; an `offset:16` immediate or as an explicit 64-bit add of 16.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare the word at %gep against %old and, on a match, store %in. Success
; ordering is release, failure ordering is acquire. %val is never read.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") release acquire
ret void
}
; Test kernel: a volatile cmpxchg on global memory (addrspace(1)) using
; syncscope("one-as") with success ordering `acq_rel` and failure ordering
; `acquire`. The access targets %out advanced by 4 i32 elements (16 bytes) and
; the cmpxchg result is never read.
;
; The per-target assertions that follow the `define` line are autogenerated by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand. In every configuration a wait-count instruction is
; expected both before and after the atomic. A cache invalidate is expected
; after the trailing wait in every configuration except the one that passes
; -amdgcn-skip-cache-invalidations, which expects the kernel to end right after
; the wait.
define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX6-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Advance %out by 4 i32 elements, i.e. 16 bytes; the expected code shows this as
; an `offset:16` immediate or as an explicit 64-bit add of 16.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare the word at %gep against %old and, on a match, store %in. Success
; ordering is acq_rel, failure ordering is acquire. %val is never read.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire
ret void
}
define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX6-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire
ret void
}
; Exercises a volatile cmpxchg on a global (addrspace(1)) pointer with
; syncscope("one-as"), success ordering monotonic and failure ordering seq_cst.
; The cmpxchg result is not used.
; The generated checks below expect, for every run line, a wait on outstanding
; memory operations before the atomic and another wait after it. All runs except
; the one passing -amdgcn-skip-cache-invalidations also expect cache invalidate
; instructions after the atomic (buffer_wbinvl1*, buffer_gl*_inv, buffer_invl2,
; buffer_inv or global_inv), and the gfx90a, gfx942 and gfx12 runs expect a
; buffer_wbl2 / global_wb before it.
; Reviewer note - the fencing on both sides presumably results from the seq_cst
; failure ordering being merged into the effective ordering; confirm against
; SIMemoryLegalizer.
; The check lines are autogenerated - regenerate them with
; utils/update_llc_test_checks.py instead of editing by hand.
define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX6-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; %gep is %out advanced by 4 i32 elements, i.e. the 16-byte offset that shows up
; in the expected instructions above.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare against %old, store %in on match; %val is intentionally left unused.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst
ret void
}
; Exercises a volatile cmpxchg on a global (addrspace(1)) pointer with
; syncscope("one-as"), success ordering acquire and failure ordering seq_cst.
; The cmpxchg result is not used.
; The generated checks below expect, for every run line, a wait on outstanding
; memory operations before the atomic and another wait after it. All runs except
; the one passing -amdgcn-skip-cache-invalidations also expect cache invalidate
; instructions after the atomic (buffer_wbinvl1*, buffer_gl*_inv, buffer_invl2,
; buffer_inv or global_inv), and the gfx90a, gfx942 and gfx12 runs expect a
; buffer_wbl2 / global_wb before it.
; Reviewer note - the pre-atomic wait/writeback presumably results from the
; seq_cst failure ordering being merged into the effective ordering; confirm
; against SIMemoryLegalizer.
; The check lines are autogenerated - regenerate them with
; utils/update_llc_test_checks.py instead of editing by hand.
define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX6-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; %gep is %out advanced by 4 i32 elements, i.e. the 16-byte offset that shows up
; in the expected instructions above.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare against %old, store %in on match; %val is intentionally left unused.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst
ret void
}
define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX6-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst
ret void
}
; Kernel under test: a volatile cmpxchg on a global (addrspace(1)) i32 located
; 4 elements past %out, using syncscope("one-as") with acq_rel success ordering
; and seq_cst failure ordering. The cmpxchg result (%val) is never used.
; The per-target assertion lines embedded in the signature below are
; autogenerated (see the note at the top of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX6-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Index 4 of an i32 array is 16 bytes past %out; the expected ISA above shows
; that same 16-byte displacement on the atomic (or folded into the address).
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare against %old and store %in on a match; %val is intentionally dead.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst
ret void
}
; Kernel under test: a volatile cmpxchg on a global (addrspace(1)) i32 located
; 4 elements past %out, using syncscope("one-as") with seq_cst ordering for
; both the success and the failure case. The cmpxchg result (%val) is never
; used.
; The per-target assertion lines embedded in the signature below are
; autogenerated (see the note at the top of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX6-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
; GFX7-NEXT: s_mov_b64 s[10:11], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s4, s8
; GFX7-NEXT: s_mov_b32 s5, s9
; GFX7-NEXT: s_mov_b32 s9, s10
; GFX7-NEXT: s_mov_b32 s8, s11
; GFX7-NEXT: s_add_u32 s4, s4, s9
; GFX7-NEXT: s_addc_u32 s8, s5, s8
; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
; GFX7-NEXT: s_mov_b32 s5, s8
; GFX7-NEXT: v_mov_b32_e32 v2, s7
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v0, v[1:2], s[4:5] offset:16
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[4:5] offset:16
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v0, v[2:3], s[0:1] offset:16 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Index 4 of an i32 array is 16 bytes past %out; the expected ISA above shows
; that same 16-byte displacement on the atomic (or folded into the address).
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare against %old and store %in on a match; %val is intentionally dead.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
ret void
}
; Kernel under test: a volatile cmpxchg on a global (addrspace(1)) pointer with
; syncscope("one-as") and monotonic success/failure orderings, whose loaded
; value is stored back to %out so that the result of the atomic is used.
;
; The check lines between the signature and the IR body are autogenerated
; (see the note at the top of this file); regenerate them with the update
; script rather than editing them by hand. For this ordering the expected
; assembly contains only the atomic, a wait for its result, and the store:
; no cache invalidate or writeback instruction appears for any configuration.
define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX6-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Address of the fifth i32 element of %out; the expected assembly shows this
; as a 16-byte offset (offset:16, or an explicit add of 16 on GFX7).
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare against %old, swap in %in; both orderings are monotonic.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") monotonic monotonic
; Field 0 of the cmpxchg result is the value that was loaded from memory.
%val0 = extractvalue { i32, i1 } %val, 0
; Store the loaded value so the returning form of the atomic is required.
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
; Kernel under test: a volatile cmpxchg on a global (addrspace(1)) pointer with
; syncscope("one-as"), acquire success ordering and monotonic failure ordering,
; whose loaded value is stored back to %out so that the result is used.
;
; The check lines between the signature and the IR body are autogenerated
; (see the note at the top of this file); regenerate them with the update
; script rather than editing them by hand. For this ordering the expected
; assembly places, after the atomic and the wait for its result, an
; invalidate instruction before the store (buffer_wbinvl1, buffer_wbinvl1_vol,
; buffer_invl2, buffer_gl1_inv/buffer_gl0_inv, buffer_inv or global_inv,
; depending on the target). The one exception is the configuration run with
; -amdgcn-skip-cache-invalidations, where no such instruction is expected.
define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX6-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Address of the fifth i32 element of %out; the expected assembly shows this
; as a 16-byte offset (offset:16, or an explicit add of 16 on GFX7).
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare against %old, swap in %in; acquire on success, monotonic on failure.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acquire monotonic
; Field 0 of the cmpxchg result is the value that was loaded from memory.
%val0 = extractvalue { i32, i1 } %val, 0
; Store the loaded value so the returning form of the atomic is required.
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
; GFX6-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") release monotonic
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
; Returning cmpxchg on a global (addrspace 1) pointer with syncscope("one-as"),
; acq_rel success ordering and monotonic failure ordering; the i32 result is
; stored back to %out.
; In the expected output below, the atomic is preceded by a wait (and by an
; L2 writeback on the targets whose checks show buffer_wbl2 / global_wb) and is
; followed by a wait plus a cache invalidate before the final store. The run
; that passes -amdgcn-skip-cache-invalidations shows the waits but no invalidate.
define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX6-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Element 4 of an i32 array is 16 bytes past %out; the expected assembly above
; shows this as offset:16 (or as an explicit 64-bit add of 16 in the flat form).
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; %old is the compare operand and %in the replacement value.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acq_rel monotonic
; Member 0 is the i32 half of the { i32, i1 } result; it is what gets stored.
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
; Returning cmpxchg on a global (addrspace 1) pointer with syncscope("one-as"),
; seq_cst success ordering and monotonic failure ordering; the i32 result is
; stored back to %out.
; In the expected output below, the atomic is preceded by a wait (and by an
; L2 writeback on the targets whose checks show buffer_wbl2 / global_wb) and is
; followed by a wait plus a cache invalidate before the final store. The run
; that passes -amdgcn-skip-cache-invalidations shows the waits but no invalidate.
define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX6-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Element 4 of an i32 array is 16 bytes past %out; the expected assembly above
; shows this as offset:16 (or as an explicit 64-bit add of 16 in the flat form).
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; %old is the compare operand and %in the replacement value.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") seq_cst monotonic
; Member 0 is the i32 half of the { i32, i1 } result; it is what gets stored.
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
; Returning cmpxchg on global memory (addrspace(1)) at syncscope("one-as") with
; success ordering monotonic and failure ordering acquire. Field 0 of the
; cmpxchg result is stored back through %out.
; The expected assembly below is autogenerated by utils/update_llc_test_checks.py
; (see the NOTE at the top of this file); regenerate it instead of editing by hand.
; What the checks pin down: nothing is inserted directly ahead of the atomic;
; after it every target emits a wait followed by its cache invalidate /
; writeback-invalidate instruction(s), except the run that passes
; -amdgcn-skip-cache-invalidations, which keeps only the wait. The gfx1200 runs
; wait on bvhcnt, samplecnt and loadcnt before global_inv.
define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX6-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Element 4 of an i32 array, i.e. 16 bytes past %out; this is the 16-byte
; offset that shows up on the atomic in the expected assembly above.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare against %old, swap in %in; success = monotonic, failure = acquire.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") monotonic acquire
; Keep only the i32 member of the { i32, i1 } result and write it to %out so
; the returning form of the atomic is required.
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
; Returning cmpxchg on global memory (addrspace(1)) at syncscope("one-as") with
; acquire as both the success and the failure ordering. Field 0 of the cmpxchg
; result is stored back through %out.
; The expected assembly below is autogenerated by utils/update_llc_test_checks.py
; (see the NOTE at the top of this file); regenerate it instead of editing by hand.
; What the checks pin down: nothing is inserted directly ahead of the atomic;
; after it every target emits a wait followed by its cache invalidate /
; writeback-invalidate instruction(s), except the run that passes
; -amdgcn-skip-cache-invalidations, which keeps only the wait. The gfx1200 runs
; wait on loadcnt alone before global_inv.
define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX6-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Element 4 of an i32 array, i.e. 16 bytes past %out; this is the 16-byte
; offset that shows up on the atomic in the expected assembly above.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare against %old, swap in %in; acquire on both success and failure.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acquire acquire
; Keep only the i32 member of the { i32, i1 } result and write it to %out so
; the returning form of the atomic is required.
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
; GFX6-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") release acquire
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
; Returning cmpxchg on a global (addrspace(1)) pointer with syncscope("one-as"),
; success ordering acq_rel and failure ordering acquire. The value loaded by the
; atomic is stored back through %out, so the result of the cmpxchg is used.
; The assertions below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX6-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Element index 4 of i32 is byte offset 16 from %out, which is the offset the
; expected atomic instructions above operate at.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare the memory at %gep against %old and, on a match, write %in.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acq_rel acquire
; Field 0 of the { i32, i1 } result is the value that was loaded from memory.
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
; Returning cmpxchg on a global (addrspace(1)) pointer with syncscope("one-as"),
; success ordering seq_cst and failure ordering acquire. The value loaded by the
; atomic is stored back through %out, so the result of the cmpxchg is used.
; The assertions below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX6-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Element index 4 of i32 is byte offset 16 from %out, which is the offset the
; expected atomic instructions above operate at.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare the memory at %gep against %old and, on a match, write %in.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") seq_cst acquire
; Field 0 of the { i32, i1 } result is the value that was loaded from memory.
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
; Kernel under test: a volatile i32 cmpxchg on global memory (addrspace(1))
; using syncscope("one-as"), with success ordering monotonic and failure
; ordering seq_cst. The value read by the cmpxchg is stored back to %out, so
; the returning form of the atomic is what the assertions below cover.
; The per-target assertion lines are autogenerated (see the NOTE at the top of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX6-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Target the i32 four elements (16 bytes) past %out.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; %old is the expected value and %in the replacement.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") monotonic seq_cst
; Field 0 of the cmpxchg result is the value that was read from memory.
%val0 = extractvalue { i32, i1 } %val, 0
; Store the observed value to the start of the buffer so the result is used.
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
; Kernel under test: a volatile i32 cmpxchg on global memory (addrspace(1))
; using syncscope("one-as"), with success ordering acquire and failure
; ordering seq_cst. The value read by the cmpxchg is stored back to %out, so
; the returning form of the atomic is what the assertions below cover.
; The per-target assertion lines are autogenerated (see the NOTE at the top of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; NOTE(review): for the gfx1200 runs the expected wait after the atomic is only
; s_wait_loadcnt, whereas the monotonic/seq_cst variant of this test also
; expects s_wait_bvhcnt and s_wait_samplecnt there. Presumably this follows
; from the acquire success ordering - confirm against SIMemoryLegalizer.
define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX6-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Target the i32 four elements (16 bytes) past %out.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; %old is the expected value and %in the replacement.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acquire seq_cst
; Field 0 of the cmpxchg result is the value that was read from memory.
%val0 = extractvalue { i32, i1 } %val, 0
; Store the observed value to the start of the buffer so the result is used.
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX6-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") release seq_cst
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
; Test kernel: volatile cmpxchg on global memory (addrspace(1)) with
; syncscope("one-as"), acq_rel success ordering and seq_cst failure ordering.
; The value loaded by the cmpxchg is stored back to %out.
; The per-target expected assembly that follows is produced by
; utils/update_llc_test_checks.py; regenerate it with that script instead of
; editing it by hand.
define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX6-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
; Address of the i32 at index 4 from %out, i.e. 16 bytes past %out.
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
; Compare the i32 at %gep with %old and, on a match, replace it with %in.
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") acq_rel seq_cst
; Field 0 of the { i32, i1 } result is the value that was loaded from memory.
%val0 = extractvalue { i32, i1 } %val, 0
; Write that loaded value to the start of %out.
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}
define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX6-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX6-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX6-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s12, s5
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 killed $sgpr4_sgpr5
; GFX6-NEXT: s_mov_b32 s10, 0x100f000
; GFX6-NEXT: s_mov_b32 s11, -1
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: s_mov_b32 s5, s12
; GFX6-NEXT: s_mov_b32 s6, s11
; GFX6-NEXT: s_mov_b32 s7, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s9
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
; GFX7-NEXT: s_mov_b64 s[12:13], 16
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, s4
; GFX7-NEXT: s_mov_b32 s7, s5
; GFX7-NEXT: s_mov_b32 s11, s12
; GFX7-NEXT: s_mov_b32 s10, s13
; GFX7-NEXT: s_add_u32 s6, s6, s11
; GFX7-NEXT: s_addc_u32 s10, s7, s10
; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
; GFX7-NEXT: s_mov_b32 s7, s10
; GFX7-NEXT: v_mov_b32_e32 v2, s9
; GFX7-NEXT: v_mov_b32_e32 v0, s8
; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1_vol
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
;
; GFX10-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s6
; GFX10-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-WGP-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX10-WGP-NEXT: buffer_gl1_inv
; GFX10-WGP-NEXT: buffer_gl0_inv
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-WGP-NEXT: s_endpgm
;
; GFX10-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
; GFX10-CU-NEXT: v_mov_b32_e32 v3, s6
; GFX10-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX10-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-CU-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[4:5] offset:16 glc
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
; GFX10-CU-NEXT: buffer_gl1_inv
; GFX10-CU-NEXT: buffer_gl0_inv
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-CU-NEXT: s_endpgm
;
; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; SKIP-CACHE-INV: ; %bb.0: ; %entry
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: s_mov_b32 s8, s1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0xf000
; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, -1
; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1_sgpr2_sgpr3
; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s8
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s7
; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s5
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s4
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:16 glc
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
;
; GFX90A-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbl2
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NOTTGSPLIT-NEXT: buffer_invl2
; GFX90A-NOTTGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX90A-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX90A-TGSPLIT-NEXT: buffer_wbl2
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[4:5] offset:16 glc
; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX90A-TGSPLIT-NEXT: buffer_invl2
; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
; GFX90A-TGSPLIT-NEXT: global_store_dword v0, v1, s[4:5]
; GFX90A-TGSPLIT-NEXT: s_endpgm
;
; GFX942-NOTTGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-NOTTGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-NOTTGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-NOTTGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-NOTTGSPLIT-NEXT: s_endpgm
;
; GFX942-TGSPLIT-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX942-TGSPLIT: ; %bb.0: ; %entry
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; GFX942-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
; GFX942-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v3, v1
; GFX942-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: global_atomic_cmpswap v1, v0, v[2:3], s[0:1] offset:16 sc0 sc1
; GFX942-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX942-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX942-TGSPLIT-NEXT: global_store_dword v0, v1, s[0:1]
; GFX942-TGSPLIT-NEXT: s_endpgm
;
; GFX11-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX11-WGP: ; %bb.0: ; %entry
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX11-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
; GFX11-WGP-NEXT: buffer_gl1_inv
; GFX11-WGP-NEXT: buffer_gl0_inv
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-WGP-NEXT: s_endpgm
;
; GFX11-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX11-CU: ; %bb.0: ; %entry
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX11-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX11-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX11-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 glc
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
; GFX11-CU-NEXT: buffer_gl1_inv
; GFX11-CU-NEXT: buffer_gl0_inv
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX11-CU-NEXT: s_endpgm
;
; GFX12-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-WGP: ; %bb.0: ; %entry
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
; GFX12-WGP-NEXT: v_mov_b32_e32 v3, s2
; GFX12-WGP-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-WGP-NEXT: v_mov_b32_e32 v2, v3
; GFX12-WGP-NEXT: global_wb scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: s_wait_storecnt 0x0
; GFX12-WGP-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SYS
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-WGP-NEXT: s_endpgm
;
; GFX12-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX12-CU: ; %bb.0: ; %entry
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
; GFX12-CU-NEXT: v_mov_b32_e32 v3, s2
; GFX12-CU-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3
; GFX12-CU-NEXT: global_wb scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: s_wait_storecnt 0x0
; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-CU-NEXT: s_endpgm
ptr addrspace(1) %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i32 4
%val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("one-as") seq_cst seq_cst
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, ptr addrspace(1) %out, align 4
ret void
}