diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll index 80445f793934..97d52d5f1f26 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX6-LABEL: workgroup_acquire_fence: @@ -78,6 +79,10 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX12-CU-LABEL: workgroup_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: workgroup_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -145,6 +150,10 @@ define amdgpu_kernel void @workgroup_release_fence() { ; GFX12-CU-LABEL: workgroup_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: workgroup_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -217,6 +226,10 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; GFX12-CU-LABEL: workgroup_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: workgroup_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -289,6 +302,10 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; GFX12-CU-LABEL: workgroup_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: workgroup_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -359,6 +376,10 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; GFX12-CU-LABEL: workgroup_one_as_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: workgroup_one_as_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -426,6 +447,10 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX12-CU-LABEL: workgroup_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: workgroup_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -498,6 +523,10 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: workgroup_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -570,6 +599,10 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX12-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: workgroup_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -662,6 +695,13 @@ define amdgpu_kernel void @agent_acquire_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: agent_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -744,6 +784,14 @@ define amdgpu_kernel void @agent_release_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: agent_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -842,6 +890,15 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: agent_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -940,6 +997,15 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: agent_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1032,6 +1098,13 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: agent_one_as_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1114,6 +1187,14 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: agent_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1212,6 +1293,15 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: agent_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1310,6 +1400,15 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: agent_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1404,6 +1503,13 @@ define amdgpu_kernel void @system_acquire_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: system_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1490,6 +1596,15 @@ define amdgpu_kernel void @system_release_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: system_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1594,6 +1709,16 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: system_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1698,6 +1823,16 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: system_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1792,6 +1927,13 @@ define amdgpu_kernel void @system_one_as_acquire_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: system_one_as_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1878,6 +2020,15 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: system_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") release, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -1982,6 +2133,16 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: system_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void @@ -2086,6 +2247,16 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: system_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"global"} ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll index 7a419a5031ba..cc42428e1aa0 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX6-LABEL: workgroup_acquire_fence: @@ -76,6 +77,11 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: workgroup_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -142,6 +148,10 @@ define amdgpu_kernel void @workgroup_release_fence() { ; GFX12-CU-LABEL: workgroup_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: workgroup_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") release, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -208,6 +218,10 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; GFX12-CU-LABEL: workgroup_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: workgroup_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -274,6 +288,10 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; GFX12-CU-LABEL: workgroup_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: workgroup_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -331,6 +349,10 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; GFX12-CU-LABEL: workgroup_one_as_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: workgroup_one_as_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -388,6 +410,10 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX12-CU-LABEL: workgroup_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: workgroup_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -445,6 +471,10 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: workgroup_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -502,6 +532,10 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX12-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: workgroup_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -570,6 +604,11 @@ define amdgpu_kernel void @agent_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: agent_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -636,6 +675,10 @@ define amdgpu_kernel void @agent_release_fence() { ; GFX12-CU-LABEL: agent_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: agent_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") release, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -702,6 +745,10 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; GFX12-CU-LABEL: agent_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: agent_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -768,6 +815,10 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; GFX12-CU-LABEL: agent_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: agent_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -825,6 +876,10 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() { ; GFX12-CU-LABEL: agent_one_as_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: agent_one_as_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -882,6 +937,10 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; GFX12-CU-LABEL: agent_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: agent_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -939,6 +998,10 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; GFX12-CU-LABEL: agent_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: agent_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -996,6 +1059,10 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; GFX12-CU-LABEL: agent_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: agent_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1064,6 +1131,11 @@ define amdgpu_kernel void @system_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: system_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1130,6 +1202,10 @@ define amdgpu_kernel void @system_release_fence() { ; GFX12-CU-LABEL: system_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: system_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence release, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1196,6 +1272,10 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; GFX12-CU-LABEL: system_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: system_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1262,6 +1342,10 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; GFX12-CU-LABEL: system_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: system_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1319,6 +1403,10 @@ define amdgpu_kernel void @system_one_as_acquire_fence() { ; GFX12-CU-LABEL: system_one_as_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: system_one_as_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1376,6 +1464,10 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; GFX12-CU-LABEL: system_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: system_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1433,6 +1525,10 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; GFX12-CU-LABEL: system_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: system_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void @@ -1490,6 +1586,10 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; GFX12-CU-LABEL: system_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: system_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"} ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll index 0e459ed0f124..b3f6533d4388 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @singlethread_acquire_fence() { ; GFX6-LABEL: singlethread_acquire_fence: @@ -65,6 +66,10 @@ define amdgpu_kernel void @singlethread_acquire_fence() { ; GFX12-CU-LABEL: singlethread_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: singlethread_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("singlethread") acquire ret void @@ -122,6 +127,10 @@ define amdgpu_kernel void @singlethread_release_fence() { ; GFX12-CU-LABEL: singlethread_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: singlethread_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("singlethread") release ret void @@ -179,6 +188,10 @@ define amdgpu_kernel void @singlethread_acq_rel_fence() { ; GFX12-CU-LABEL: singlethread_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: singlethread_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("singlethread") acq_rel ret void @@ -236,6 +249,10 @@ define amdgpu_kernel void @singlethread_seq_cst_fence() { ; GFX12-CU-LABEL: singlethread_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: singlethread_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("singlethread") seq_cst ret void @@ -293,6 +310,10 @@ define amdgpu_kernel void @singlethread_one_as_acquire_fence() { ; GFX12-CU-LABEL: singlethread_one_as_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: singlethread_one_as_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("singlethread-one-as") acquire ret void @@ -350,6 +371,10 @@ define amdgpu_kernel void @singlethread_one_as_release_fence() { ; GFX12-CU-LABEL: singlethread_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: singlethread_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("singlethread-one-as") release ret void @@ -407,6 +432,10 @@ define amdgpu_kernel void @singlethread_one_as_acq_rel_fence() { ; GFX12-CU-LABEL: singlethread_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: singlethread_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("singlethread-one-as") acq_rel ret void @@ -464,6 +493,10 @@ define amdgpu_kernel void @singlethread_one_as_seq_cst_fence() { ; GFX12-CU-LABEL: singlethread_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: singlethread_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("singlethread-one-as") seq_cst ret void @@ -521,6 +554,10 @@ define amdgpu_kernel void @wavefront_acquire_fence() { ; GFX12-CU-LABEL: wavefront_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: wavefront_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("wavefront") acquire ret void @@ -578,6 +615,10 @@ define amdgpu_kernel void @wavefront_release_fence() { ; GFX12-CU-LABEL: wavefront_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: wavefront_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("wavefront") release ret void @@ -635,6 +676,10 @@ define amdgpu_kernel void @wavefront_acq_rel_fence() { ; GFX12-CU-LABEL: wavefront_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: wavefront_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("wavefront") acq_rel ret void @@ -692,6 +737,10 @@ define amdgpu_kernel void @wavefront_seq_cst_fence() { ; GFX12-CU-LABEL: wavefront_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: wavefront_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("wavefront") seq_cst ret void @@ -749,6 +798,10 @@ define amdgpu_kernel void @wavefront_one_as_acquire_fence() { ; GFX12-CU-LABEL: wavefront_one_as_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: wavefront_one_as_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("wavefront-one-as") acquire ret void @@ -806,6 +859,10 @@ define amdgpu_kernel void @wavefront_one_as_release_fence() { ; GFX12-CU-LABEL: wavefront_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: wavefront_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("wavefront-one-as") release ret void @@ -863,6 +920,10 @@ define amdgpu_kernel void @wavefront_one_as_acq_rel_fence() { ; GFX12-CU-LABEL: wavefront_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: wavefront_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("wavefront-one-as") acq_rel ret void @@ -920,6 +981,10 @@ define amdgpu_kernel void @wavefront_one_as_seq_cst_fence() { ; GFX12-CU-LABEL: wavefront_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: wavefront_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("wavefront-one-as") seq_cst ret void @@ -998,6 +1063,11 @@ define amdgpu_kernel void @workgroup_acquire_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: workgroup_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") acquire ret void @@ -1073,6 +1143,11 @@ define amdgpu_kernel void @workgroup_release_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: workgroup_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") release ret void @@ -1153,6 +1228,11 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: workgroup_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") acq_rel ret void @@ -1233,6 +1313,11 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() { ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: workgroup_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup") seq_cst ret void @@ -1303,6 +1388,10 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() { ; GFX12-CU-LABEL: workgroup_one_as_acquire_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: workgroup_one_as_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acquire ret void @@ -1370,6 +1459,10 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() { ; GFX12-CU-LABEL: workgroup_one_as_release_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: workgroup_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") release ret void @@ -1442,6 +1535,10 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() { ; GFX12-CU-LABEL: workgroup_one_as_acq_rel_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: workgroup_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") acq_rel ret void @@ -1514,6 +1611,10 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() { ; GFX12-CU-LABEL: workgroup_one_as_seq_cst_fence: ; GFX12-CU: ; %bb.0: ; %entry ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: workgroup_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_endpgm entry: fence syncscope("workgroup-one-as") seq_cst ret void @@ -1606,6 +1707,13 @@ define amdgpu_kernel void @agent_acquire_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: agent_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") acquire ret void @@ -1688,6 +1796,14 @@ define amdgpu_kernel void @agent_release_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: agent_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") release ret void @@ -1786,6 +1902,15 @@ define amdgpu_kernel void @agent_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: agent_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") acq_rel ret void @@ -1884,6 +2009,15 @@ define amdgpu_kernel void @agent_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: agent_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent") seq_cst ret void @@ -1976,6 +2110,13 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: agent_one_as_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acquire ret void @@ -2058,6 +2199,14 @@ define amdgpu_kernel void @agent_one_as_release_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: agent_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") release ret void @@ -2156,6 +2305,15 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: agent_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") acq_rel ret void @@ -2254,6 +2412,15 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: agent_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm entry: fence syncscope("agent-one-as") seq_cst ret void @@ -2348,6 +2515,13 @@ define amdgpu_kernel void @system_acquire_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: system_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence acquire ret void @@ -2434,6 +2608,15 @@ define amdgpu_kernel void @system_release_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: system_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence release ret void @@ -2538,6 +2721,16 @@ define amdgpu_kernel void @system_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: system_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence acq_rel ret void @@ -2642,6 +2835,16 @@ define amdgpu_kernel void @system_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: system_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence seq_cst ret void @@ -2736,6 +2939,13 @@ define amdgpu_kernel void @system_one_as_acquire_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: system_one_as_acquire_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") acquire ret void @@ -2822,6 +3032,15 @@ define amdgpu_kernel void @system_one_as_release_fence() { ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: system_one_as_release_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") release ret void @@ -2926,6 +3145,16 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: system_one_as_acq_rel_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") acq_rel ret void @@ -3030,6 +3259,16 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() { ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: system_one_as_seq_cst_fence: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm entry: fence syncscope("one-as") seq_cst ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index 07ad8cb0c4a3..36adbc001111 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @flat_agent_unordered_load( ; GFX7-LABEL: flat_agent_unordered_load: @@ -187,6 +188,17 @@ define amdgpu_kernel void @flat_agent_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent") unordered, align 4 @@ -369,6 +381,17 @@ define amdgpu_kernel void @flat_agent_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent") monotonic, align 4 @@ -566,6 +589,18 @@ define amdgpu_kernel void @flat_agent_acquire_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent") acquire, align 4 @@ -789,6 +824,24 @@ define amdgpu_kernel void @flat_agent_seq_cst_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent") seq_cst, align 4 @@ -939,6 +992,16 @@ define amdgpu_kernel void @flat_agent_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent") unordered, align 4 @@ -1088,6 +1151,16 @@ define amdgpu_kernel void @flat_agent_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent") monotonic, align 4 @@ -1261,6 +1334,20 @@ define amdgpu_kernel void @flat_agent_release_store( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent") release, align 4 @@ -1434,6 +1521,20 @@ define amdgpu_kernel void @flat_agent_seq_cst_store( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent") seq_cst, align 4 @@ -1583,6 +1684,16 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") monotonic @@ -1763,6 +1874,18 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire @@ -1936,6 +2059,20 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") release @@ -2140,6 +2277,22 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel @@ -2344,6 +2497,22 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst @@ -2552,6 +2721,19 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acquire @@ -2789,6 +2971,25 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") acq_rel @@ -3026,6 +3227,25 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst @@ -3264,6 +3484,20 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3533,6 +3767,22 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3795,6 +4045,24 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4088,6 +4356,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4381,6 +4669,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4650,6 +4958,22 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4919,6 +5243,22 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5212,6 +5552,26 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5505,6 +5865,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5798,6 +6178,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6091,6 +6491,26 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6384,6 +6804,26 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6677,6 +7117,26 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6970,6 +7430,26 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7263,6 +7743,26 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7545,6 +8045,22 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7844,6 +8360,23 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8152,6 +8685,26 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8479,6 +9032,29 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8806,6 +9382,29 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9109,6 +9708,25 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9408,6 +10026,23 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9735,6 +10370,29 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10062,6 +10720,29 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10389,6 +11070,29 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10716,6 +11420,29 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11039,6 +11766,27 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11366,6 +12114,29 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11693,6 +12464,29 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12020,6 +12814,29 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12204,6 +13021,17 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent-one-as") unordered, align 4 @@ -12386,6 +13214,17 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent-one-as") monotonic, align 4 @@ -12593,6 +13432,19 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent-one-as") acquire, align 4 @@ -12826,6 +13678,25 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("agent-one-as") seq_cst, align 4 @@ -12976,6 +13847,16 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent-one-as") unordered, align 4 @@ -13125,6 +14006,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent-one-as") monotonic, align 4 @@ -13298,6 +14189,20 @@ define amdgpu_kernel void @flat_agent_one_as_release_store( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent-one-as") release, align 4 @@ -13471,6 +14376,20 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("agent-one-as") seq_cst, align 4 @@ -13620,6 +14539,16 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") monotonic @@ -13796,6 +14725,18 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acquire @@ -13969,6 +14910,20 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") release @@ -14169,6 +15124,22 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel @@ -14369,6 +15340,22 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") seq_cst @@ -14587,6 +15574,20 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acquire @@ -14834,6 +15835,26 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") acq_rel @@ -15081,6 +16102,26 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent-one-as") seq_cst @@ -15319,6 +16360,20 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15584,6 +16639,22 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15846,6 +16917,24 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16135,6 +17224,26 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16424,6 +17533,26 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16689,6 +17818,22 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16954,6 +18099,22 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17243,6 +18404,26 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17532,6 +18713,26 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17821,6 +19022,26 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18110,6 +19331,26 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18399,6 +19640,26 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18688,6 +19949,26 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18977,6 +20258,26 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19266,6 +20567,26 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19548,6 +20869,22 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19857,6 +21194,24 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20165,6 +21520,26 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20502,6 +21877,30 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20839,6 +22238,30 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21152,6 +22575,26 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21461,6 +22904,24 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21798,6 +23259,30 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22135,6 +23620,30 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22472,6 +23981,30 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22809,6 +24342,30 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23142,6 +24699,28 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23479,6 +25058,30 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23816,6 +25419,30 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -24153,6 +25780,30 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll index a00af8e5b658..8d98f532908f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) { ; GFX12-LABEL: flat_last_use_load_0: @@ -16,6 +17,17 @@ define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) { ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_last_use_load_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_LU +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm entry: %val = load i32, ptr %in, align 4, !amdgpu.last.use !{} store i32 %val, ptr %out @@ -55,6 +67,21 @@ define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) { ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_last_use_load_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_LU +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %val.gep = getelementptr inbounds i32, ptr %in, i32 %tid @@ -80,6 +107,19 @@ define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) { ; GFX12-NEXT: s_wait_dscnt 0x0 ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_last_use_and_volatile_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm entry: %val = load volatile i32, ptr %in, align 4, !amdgpu.last.use !{} store i32 %val, ptr %out @@ -100,6 +140,17 @@ define amdgpu_kernel void @flat_last_use_and_nontemporal_load(ptr %in, ptr %out) ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_last_use_and_nontemporal_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_LU +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm entry: %val = load i32, ptr %in, align 4, !amdgpu.last.use !{}, !nontemporal !0 store i32 %val, ptr %out diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll index 3c24c36ec547..af48eaf8fcda 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX7-LABEL: flat_nontemporal_load_0: @@ -187,6 +188,17 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_nontemporal_load_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load i32, ptr %in, align 4, !nontemporal !0 @@ -555,6 +567,21 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_nontemporal_load_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_NT +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -739,6 +766,17 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 th:TH_STORE_NT ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_nontemporal_store_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] th:TH_STORE_NT scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load i32, ptr %in, align 4 @@ -1095,6 +1133,20 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 th:TH_STORE_NT ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_nontemporal_store_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset th:TH_STORE_NT scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1293,6 +1345,19 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_nontemporal_volatile_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load volatile i32, ptr %in, align 4, !nontemporal !0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll index b88a10ab24a9..871c941dd6dc 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX7-LABEL: flat_singlethread_unordered_load: @@ -187,6 +188,17 @@ define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread") unordered, align 4 @@ -369,6 +381,17 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread") monotonic, align 4 @@ -551,6 +574,17 @@ define amdgpu_kernel void @flat_singlethread_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread") acquire, align 4 @@ -733,6 +767,17 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread") seq_cst, align 4 @@ -883,6 +928,16 @@ define amdgpu_kernel void @flat_singlethread_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread") unordered, align 4 @@ -1032,6 +1087,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread") monotonic, align 4 @@ -1181,6 +1246,16 @@ define amdgpu_kernel void @flat_singlethread_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread") release, align 4 @@ -1330,6 +1405,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread") seq_cst, align 4 @@ -1479,6 +1564,16 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") monotonic @@ -1628,6 +1723,16 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire @@ -1777,6 +1882,16 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") release @@ -1926,6 +2041,16 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel @@ -2075,6 +2200,16 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst @@ -2268,6 +2403,18 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acquire @@ -2462,6 +2609,18 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") acq_rel @@ -2656,6 +2815,18 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread") seq_cst @@ -2894,6 +3065,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3132,6 +3317,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3370,6 +3569,20 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3608,6 +3821,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3846,6 +4073,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4084,6 +4325,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4322,6 +4577,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4560,6 +4829,20 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4798,6 +5081,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5036,6 +5333,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5274,6 +5585,20 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5512,6 +5837,20 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5750,6 +6089,20 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5988,6 +6341,20 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6226,6 +6593,20 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6508,6 +6889,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6792,6 +7189,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7076,6 +7489,22 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7360,6 +7789,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7644,6 +8089,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7928,6 +8389,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8212,6 +8689,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8496,6 +8989,22 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8780,6 +9289,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9064,6 +9589,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9348,6 +9889,22 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9632,6 +10189,22 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9916,6 +10489,22 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10200,6 +10789,22 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10484,6 +11089,22 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10668,6 +11289,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread-one-as") unordered, align 4 @@ -10850,6 +11482,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread-one-as") monotonic, align 4 @@ -11032,6 +11675,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread-one-as") acquire, align 4 @@ -11214,6 +11868,17 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("singlethread-one-as") seq_cst, align 4 @@ -11364,6 +12029,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread-one-as") unordered, align 4 @@ -11513,6 +12188,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread-one-as") monotonic, align 4 @@ -11662,6 +12347,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread-one-as") release, align 4 @@ -11811,6 +12506,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("singlethread-one-as") seq_cst, align 4 @@ -11960,6 +12665,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") monotonic @@ -12109,6 +12824,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire @@ -12258,6 +12983,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") release @@ -12407,6 +13142,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -12556,6 +13301,16 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -12749,6 +13504,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acquire @@ -12943,6 +13710,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -13137,6 +13916,18 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -13375,6 +14166,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13613,6 +14418,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13851,6 +14670,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14089,6 +14922,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14327,6 +15174,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14565,6 +15426,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14803,6 +15678,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15041,6 +15930,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15279,6 +16182,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15517,6 +16434,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15755,6 +16686,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15993,6 +16938,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16231,6 +17190,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16469,6 +17442,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16707,6 +17694,20 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16989,6 +17990,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17273,6 +18290,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17557,6 +18590,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17841,6 +18890,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18125,6 +19190,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18409,6 +19490,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18693,6 +19790,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18977,6 +20090,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19261,6 +20390,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19545,6 +20690,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19829,6 +20990,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20113,6 +21290,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20397,6 +21590,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20681,6 +21890,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20965,6 +22190,22 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index 919fc3e8f4e4..9d70a2437e55 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @flat_system_unordered_load( ; GFX7-LABEL: flat_system_unordered_load: @@ -187,6 +188,17 @@ define amdgpu_kernel void @flat_system_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in unordered, align 4 @@ -369,6 +381,17 @@ define amdgpu_kernel void @flat_system_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in monotonic, align 4 @@ -568,6 +591,18 @@ define amdgpu_kernel void @flat_system_acquire_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in acquire, align 4 @@ -793,6 +828,24 @@ define amdgpu_kernel void @flat_system_seq_cst_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in seq_cst, align 4 @@ -943,6 +996,16 @@ define amdgpu_kernel void @flat_system_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out unordered, align 4 @@ -1092,6 +1155,16 @@ define amdgpu_kernel void @flat_system_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out monotonic, align 4 @@ -1269,6 +1342,21 @@ define amdgpu_kernel void @flat_system_release_store( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out release, align 4 @@ -1446,6 +1534,21 @@ define amdgpu_kernel void @flat_system_seq_cst_store( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out seq_cst, align 4 @@ -1595,6 +1698,16 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in monotonic @@ -1777,6 +1890,18 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in acquire @@ -1954,6 +2079,21 @@ define amdgpu_kernel void @flat_system_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in release @@ -2164,6 +2304,23 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel @@ -2374,6 +2531,23 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst @@ -2584,6 +2758,19 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in acquire @@ -2827,6 +3014,26 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in acq_rel @@ -3070,6 +3277,26 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in seq_cst @@ -3308,6 +3535,20 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3579,6 +3820,22 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3845,6 +4102,25 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4144,6 +4420,27 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4443,6 +4740,27 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4714,6 +5032,22 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4985,6 +5319,22 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5284,6 +5634,27 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5583,6 +5954,27 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5882,6 +6274,27 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6181,6 +6594,27 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6480,6 +6914,27 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6779,6 +7234,27 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7078,6 +7554,27 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7377,6 +7874,27 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7659,6 +8177,22 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7960,6 +8494,23 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8272,6 +8823,27 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8605,6 +9177,30 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8938,6 +9534,30 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9243,6 +9863,25 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9544,6 +10183,23 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9877,6 +10533,30 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10210,6 +10890,30 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10543,6 +11247,30 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10876,6 +11604,30 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11205,6 +11957,28 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11538,6 +12312,30 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -11871,6 +12669,30 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12204,6 +13026,30 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -12388,6 +13234,17 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("one-as") unordered, align 4 @@ -12570,6 +13427,17 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("one-as") monotonic, align 4 @@ -12779,6 +13647,19 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("one-as") acquire, align 4 @@ -13014,6 +13895,25 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("one-as") seq_cst, align 4 @@ -13164,6 +14064,16 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("one-as") unordered, align 4 @@ -13313,6 +14223,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("one-as") monotonic, align 4 @@ -13490,6 +14410,21 @@ define amdgpu_kernel void @flat_system_one_as_release_store( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("one-as") release, align 4 @@ -13667,6 +14602,21 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("one-as") seq_cst, align 4 @@ -13816,6 +14766,16 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") monotonic @@ -13994,6 +14954,18 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire @@ -14171,6 +15143,21 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") release @@ -14377,6 +15364,23 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel @@ -14583,6 +15587,23 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst @@ -14803,6 +15824,20 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acquire @@ -15056,6 +16091,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") acq_rel @@ -15309,6 +16365,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("one-as") seq_cst @@ -15547,6 +16624,20 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15814,6 +16905,22 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16080,6 +17187,25 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16375,6 +17501,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16670,6 +17817,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16937,6 +18105,22 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17204,6 +18388,22 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17499,6 +18699,27 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17794,6 +19015,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18089,6 +19331,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18384,6 +19647,27 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18679,6 +19963,27 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18974,6 +20279,27 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19269,6 +20595,27 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19564,6 +20911,27 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19846,6 +21214,22 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20157,6 +21541,24 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20469,6 +21871,27 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20812,6 +22235,31 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21155,6 +22603,31 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21470,6 +22943,26 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21781,6 +23274,24 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22124,6 +23635,31 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22467,6 +24003,31 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -22810,6 +24371,31 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23153,6 +24739,31 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23492,6 +25103,29 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -23835,6 +25469,31 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -24178,6 +25837,31 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -24521,6 +26205,31 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index a88e0e217fdb..43f015c3a2e0 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -7,6 +7,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX7-LABEL: flat_nontemporal_load_0: @@ -143,6 +144,19 @@ define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_nontemporal_load_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load volatile i32, ptr %in, align 4 @@ -415,6 +429,23 @@ define amdgpu_kernel void @flat_nontemporal_load_1( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_nontemporal_load_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -563,6 +594,18 @@ define amdgpu_kernel void @flat_nontemporal_store_0( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_nontemporal_store_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load i32, ptr %in, align 4 @@ -831,6 +874,21 @@ define amdgpu_kernel void @flat_nontemporal_store_1( ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_nontemporal_store_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v0, v0, s2 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -971,6 +1029,17 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_volatile_workgroup_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic volatile i32, ptr %in syncscope("workgroup") acquire, align 4 @@ -1090,6 +1159,17 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_volatile_workgroup_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic volatile i32 %in, ptr %out syncscope("workgroup") release, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll index 7c637a20ab47..f086542b3d1f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX7-LABEL: flat_wavefront_unordered_load: @@ -187,6 +188,17 @@ define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront") unordered, align 4 @@ -369,6 +381,17 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront") monotonic, align 4 @@ -551,6 +574,17 @@ define amdgpu_kernel void @flat_wavefront_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront") acquire, align 4 @@ -733,6 +767,17 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront") seq_cst, align 4 @@ -883,6 +928,16 @@ define amdgpu_kernel void @flat_wavefront_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront") unordered, align 4 @@ -1032,6 +1087,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront") monotonic, align 4 @@ -1181,6 +1246,16 @@ define amdgpu_kernel void @flat_wavefront_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront") release, align 4 @@ -1330,6 +1405,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront") seq_cst, align 4 @@ -1479,6 +1564,16 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") monotonic @@ -1628,6 +1723,16 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire @@ -1777,6 +1882,16 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") release @@ -1926,6 +2041,16 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel @@ -2075,6 +2200,16 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst @@ -2268,6 +2403,18 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acquire @@ -2462,6 +2609,18 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") acq_rel @@ -2656,6 +2815,18 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront") seq_cst @@ -2894,6 +3065,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3132,6 +3317,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3370,6 +3569,20 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3608,6 +3821,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3846,6 +4073,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4084,6 +4325,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4322,6 +4577,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4560,6 +4829,20 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4798,6 +5081,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5036,6 +5333,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5274,6 +5585,20 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5512,6 +5837,20 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5750,6 +6089,20 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5988,6 +6341,20 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6226,6 +6593,20 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6508,6 +6889,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6792,6 +7189,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7076,6 +7489,22 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7360,6 +7789,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7644,6 +8089,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7928,6 +8389,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8212,6 +8689,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8496,6 +8989,22 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8780,6 +9289,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9064,6 +9589,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9348,6 +9889,22 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9632,6 +10189,22 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9916,6 +10489,22 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10200,6 +10789,22 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10484,6 +11089,22 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10668,6 +11289,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront-one-as") unordered, align 4 @@ -10850,6 +11482,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront-one-as") monotonic, align 4 @@ -11032,6 +11675,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront-one-as") acquire, align 4 @@ -11214,6 +11868,17 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("wavefront-one-as") seq_cst, align 4 @@ -11364,6 +12029,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront-one-as") unordered, align 4 @@ -11513,6 +12188,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront-one-as") monotonic, align 4 @@ -11662,6 +12347,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront-one-as") release, align 4 @@ -11811,6 +12506,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("wavefront-one-as") seq_cst, align 4 @@ -11960,6 +12665,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") monotonic @@ -12109,6 +12824,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire @@ -12258,6 +12983,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") release @@ -12407,6 +13142,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -12556,6 +13301,16 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -12749,6 +13504,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acquire @@ -12943,6 +13710,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -13137,6 +13916,18 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -13375,6 +14166,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13613,6 +14418,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13851,6 +14670,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14089,6 +14922,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14327,6 +15174,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14565,6 +15426,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14803,6 +15678,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15041,6 +15930,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15279,6 +16182,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15517,6 +16434,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15755,6 +16686,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15993,6 +16938,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16231,6 +17190,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16469,6 +17442,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16707,6 +17694,20 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16989,6 +17990,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17273,6 +18290,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17557,6 +18590,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17841,6 +18890,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18125,6 +19190,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18409,6 +19490,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18693,6 +19790,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18977,6 +20090,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19261,6 +20390,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19545,6 +20690,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19829,6 +20990,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20113,6 +21290,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20397,6 +21590,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20681,6 +21890,22 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index 0fd4aa4a7a93..d8e6ad043e06 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -11,6 +11,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX7-LABEL: flat_workgroup_unordered_load: @@ -187,6 +188,17 @@ define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup") unordered, align 4 @@ -369,6 +381,17 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup") monotonic, align 4 @@ -563,6 +586,17 @@ define amdgpu_kernel void @flat_workgroup_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup") acquire, align 4 @@ -776,6 +810,18 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup") seq_cst, align 4 @@ -926,6 +972,16 @@ define amdgpu_kernel void @flat_workgroup_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup") unordered, align 4 @@ -1075,6 +1131,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup") monotonic, align 4 @@ -1241,6 +1307,17 @@ define amdgpu_kernel void @flat_workgroup_release_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup") release, align 4 @@ -1407,6 +1484,17 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup") seq_cst, align 4 @@ -1556,6 +1644,16 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") monotonic @@ -1724,6 +1822,17 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire @@ -1890,6 +1999,17 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") release @@ -2075,6 +2195,18 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel @@ -2260,6 +2392,18 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw( ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst @@ -2465,6 +2609,18 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire @@ -2690,6 +2846,19 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel @@ -2915,6 +3084,19 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst @@ -3153,6 +3335,20 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3410,6 +3606,21 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3665,6 +3876,21 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -3939,6 +4165,22 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4213,6 +4455,22 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4470,6 +4728,21 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -4727,6 +5000,21 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5001,6 +5289,22 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5275,6 +5579,22 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5549,6 +5869,22 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -5823,6 +6159,22 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6105,6 +6457,22 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6401,6 +6769,22 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -6702,6 +7086,23 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7017,6 +7418,23 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7332,6 +7750,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7630,6 +8065,22 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -7926,6 +8377,22 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8241,6 +8708,23 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8556,6 +9040,23 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -8871,6 +9372,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9186,6 +9704,23 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9499,6 +10034,23 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -9814,6 +10366,23 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10129,6 +10698,23 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10444,6 +11030,23 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -10628,6 +11231,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup-one-as") unordered, align 4 @@ -10810,6 +11424,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup-one-as") monotonic, align 4 @@ -11000,6 +11625,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup-one-as") acquire, align 4 @@ -11202,6 +11838,17 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %in, ptr %out) { entry: %val = load atomic i32, ptr %in syncscope("workgroup-one-as") seq_cst, align 4 @@ -11352,6 +11999,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup-one-as") unordered, align 4 @@ -11501,6 +12158,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup-one-as") monotonic, align 4 @@ -11660,6 +12327,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup-one-as") release, align 4 @@ -11819,6 +12496,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm i32 %in, ptr %out) { entry: store atomic i32 %in, ptr %out syncscope("workgroup-one-as") seq_cst, align 4 @@ -11968,6 +12655,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") monotonic @@ -12127,6 +12824,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire @@ -12286,6 +12993,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") release @@ -12455,6 +13172,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -12624,6 +13351,16 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -12825,6 +13562,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire @@ -13039,6 +13788,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -13253,6 +14014,18 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: flat_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -13491,6 +14264,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13739,6 +14526,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -13987,6 +14788,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14245,6 +15060,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14503,6 +15332,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14751,6 +15594,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -14999,6 +15856,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15257,6 +16128,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15515,6 +16400,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -15773,6 +16672,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16031,6 +16944,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16289,6 +17216,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16547,6 +17488,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -16805,6 +17760,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17063,6 +18032,20 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17345,6 +18328,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17637,6 +18636,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -17931,6 +18946,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18235,6 +19266,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18539,6 +19586,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -18833,6 +19896,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19125,6 +20204,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19429,6 +20524,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -19733,6 +20844,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20037,6 +21164,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20341,6 +21484,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20643,6 +21802,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -20947,6 +22122,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21251,6 +22442,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 @@ -21555,6 +22762,22 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1] scope:SCOPE_SE +; GFX1250-NEXT: s_endpgm ptr %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll index 74a72e04fa4a..184e15406bfb 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @global_agent_unordered_load( ; GFX6-LABEL: global_agent_unordered_load: @@ -190,6 +191,17 @@ define amdgpu_kernel void @global_agent_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") unordered, align 4 @@ -374,6 +386,17 @@ define amdgpu_kernel void @global_agent_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") monotonic, align 4 @@ -574,6 +597,18 @@ define amdgpu_kernel void @global_agent_acquire_load( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") acquire, align 4 @@ -793,6 +828,24 @@ define amdgpu_kernel void @global_agent_seq_cst_load( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") seq_cst, align 4 @@ -950,6 +1003,16 @@ define amdgpu_kernel void @global_agent_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") unordered, align 4 @@ -1106,6 +1169,16 @@ define amdgpu_kernel void @global_agent_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") monotonic, align 4 @@ -1287,6 +1360,20 @@ define amdgpu_kernel void @global_agent_release_store( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") release, align 4 @@ -1468,6 +1555,20 @@ define amdgpu_kernel void @global_agent_seq_cst_store( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent") seq_cst, align 4 @@ -1622,6 +1723,16 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") monotonic @@ -1805,6 +1916,18 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire @@ -1984,6 +2107,20 @@ define amdgpu_kernel void @global_agent_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") release @@ -2192,6 +2329,22 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel @@ -2400,6 +2553,22 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst @@ -2598,6 +2767,19 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acquire @@ -2826,6 +3008,25 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") acq_rel @@ -3054,6 +3255,25 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst @@ -3273,6 +3493,20 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3521,6 +3755,22 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3765,6 +4015,24 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4038,6 +4306,26 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4311,6 +4599,26 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4559,6 +4867,22 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4807,6 +5131,22 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5080,6 +5420,26 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5353,6 +5713,26 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5626,6 +6006,26 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5899,6 +6299,26 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6172,6 +6592,26 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6445,6 +6885,26 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6718,6 +7178,26 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6991,6 +7471,26 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7240,6 +7740,22 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7507,6 +8023,23 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7783,6 +8316,26 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8079,6 +8632,29 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8375,6 +8951,29 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8646,6 +9245,25 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8913,6 +9531,23 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9209,6 +9844,29 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9505,6 +10163,29 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9801,6 +10482,29 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10097,6 +10801,29 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10389,6 +11116,27 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10685,6 +11433,29 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10981,6 +11752,29 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11277,6 +12071,29 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -11463,6 +12280,17 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") unordered, align 4 @@ -11647,6 +12475,17 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") monotonic, align 4 @@ -11847,6 +12686,18 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") acquire, align 4 @@ -12066,6 +12917,24 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("agent-one-as") seq_cst, align 4 @@ -12223,6 +13092,16 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") unordered, align 4 @@ -12379,6 +13258,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") monotonic, align 4 @@ -12560,6 +13449,20 @@ define amdgpu_kernel void @global_agent_one_as_release_store( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") release, align 4 @@ -12741,6 +13644,20 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("agent-one-as") seq_cst, align 4 @@ -12895,6 +13812,16 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") monotonic @@ -13078,6 +14005,18 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire @@ -13257,6 +14196,20 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") release @@ -13465,6 +14418,22 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel @@ -13673,6 +14642,22 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst @@ -13871,6 +14856,19 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acquire @@ -14099,6 +15097,25 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") acq_rel @@ -14327,6 +15344,25 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent-one-as") seq_cst @@ -14546,6 +15582,20 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14794,6 +15844,22 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15038,6 +16104,24 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15311,6 +16395,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15584,6 +16688,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15832,6 +16956,22 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16080,6 +17220,22 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16353,6 +17509,26 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16626,6 +17802,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16899,6 +18095,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17172,6 +18388,26 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17445,6 +18681,26 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17718,6 +18974,26 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17991,6 +19267,26 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18264,6 +19560,26 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18513,6 +19829,22 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18780,6 +20112,23 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19076,6 +20425,29 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19372,6 +20744,29 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19643,6 +21038,25 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19910,6 +21324,23 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20206,6 +21637,29 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20502,6 +21956,29 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20798,6 +22275,29 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21094,6 +22594,29 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21386,6 +22909,27 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21682,6 +23226,29 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21978,6 +23545,29 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -22274,6 +23864,29 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_DEV ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll index 5f952b98041f..ed2d62356f8f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; GFX12-LABEL: global_last_use_load_0: @@ -14,6 +15,18 @@ define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addr ; GFX12-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: global_last_use_load_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm entry: %val = load i32, ptr addrspace(1) %in, align 4, !amdgpu.last.use !{} store i32 %val, ptr addrspace(1) %out @@ -37,6 +50,21 @@ define amdgpu_kernel void @global_last_use_load_1(ptr addrspace(1) %in, ptr addr ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: global_last_use_load_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_LU +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %val.gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid @@ -58,6 +86,19 @@ define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %i ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: global_last_use_and_volatile_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm entry: %val = load volatile i32, ptr addrspace(1) %in, align 4, !amdgpu.last.use !{} store i32 %val, ptr addrspace(1) %out @@ -81,6 +122,21 @@ define amdgpu_kernel void @global_last_use_and_nontemporal_load(ptr addrspace(1) ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: global_last_use_and_nontemporal_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_LU +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %val.gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll index 16e55058e4fc..0ad64f5599fe 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @global_nontemporal_load_0( ; GFX6-LABEL: global_nontemporal_load_0: @@ -189,6 +190,18 @@ define amdgpu_kernel void @global_nontemporal_load_0( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_nontemporal_load_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load i32, ptr addrspace(1) %in, align 4, !nontemporal !0 @@ -448,6 +461,21 @@ define amdgpu_kernel void @global_nontemporal_load_1( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_nontemporal_load_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_NT +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -633,6 +661,18 @@ define amdgpu_kernel void @global_nontemporal_store_0( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_nontemporal_store_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load i32, ptr addrspace(1) %in, align 4 @@ -866,6 +906,20 @@ define amdgpu_kernel void @global_nontemporal_store_1( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] th:TH_STORE_NT ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_nontemporal_store_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s3, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v0, v0, s3 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset th:TH_STORE_NT +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -1056,6 +1110,19 @@ define amdgpu_kernel void @global_nontemporal_volatile_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_nontemporal_volatile_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_NT scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load volatile i32, ptr addrspace(1) %in, align 4, !nontemporal !0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll index 8042d3871610..6a5a6e01c741 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @global_singlethread_unordered_load( ; GFX6-LABEL: global_singlethread_unordered_load: @@ -190,6 +191,17 @@ define amdgpu_kernel void @global_singlethread_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") unordered, align 4 @@ -374,6 +386,17 @@ define amdgpu_kernel void @global_singlethread_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") monotonic, align 4 @@ -558,6 +581,17 @@ define amdgpu_kernel void @global_singlethread_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") acquire, align 4 @@ -742,6 +776,17 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread") seq_cst, align 4 @@ -899,6 +944,16 @@ define amdgpu_kernel void @global_singlethread_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") unordered, align 4 @@ -1055,6 +1110,16 @@ define amdgpu_kernel void @global_singlethread_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") monotonic, align 4 @@ -1211,6 +1276,16 @@ define amdgpu_kernel void @global_singlethread_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") release, align 4 @@ -1367,6 +1442,16 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread") seq_cst, align 4 @@ -1521,6 +1606,16 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") monotonic @@ -1675,6 +1770,16 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire @@ -1829,6 +1934,16 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") release @@ -1983,6 +2098,16 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel @@ -2137,6 +2262,16 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst @@ -2319,6 +2454,18 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acquire @@ -2502,6 +2649,18 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") acq_rel @@ -2685,6 +2844,18 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread") seq_cst @@ -2904,6 +3075,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3123,6 +3308,20 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3342,6 +3541,20 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3561,6 +3774,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3780,6 +4007,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3999,6 +4240,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4218,6 +4473,20 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4437,6 +4706,20 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4656,6 +4939,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4875,6 +5172,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5094,6 +5405,20 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5313,6 +5638,20 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5532,6 +5871,20 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5751,6 +6104,20 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5970,6 +6337,20 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6219,6 +6600,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6470,6 +6867,22 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6721,6 +7134,22 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6972,6 +7401,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7223,6 +7668,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7474,6 +7935,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7725,6 +8202,22 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7976,6 +8469,22 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8227,6 +8736,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8478,6 +9003,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8729,6 +9270,22 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8980,6 +9537,22 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9231,6 +9804,22 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9482,6 +10071,22 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9733,6 +10338,22 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9919,6 +10540,17 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") unordered, align 4 @@ -10103,6 +10735,17 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") monotonic, align 4 @@ -10287,6 +10930,17 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") acquire, align 4 @@ -10471,6 +11125,17 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("singlethread-one-as") seq_cst, align 4 @@ -10628,6 +11293,16 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") unordered, align 4 @@ -10784,6 +11459,16 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") monotonic, align 4 @@ -10940,6 +11625,16 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") release, align 4 @@ -11096,6 +11791,16 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("singlethread-one-as") seq_cst, align 4 @@ -11250,6 +11955,16 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") monotonic @@ -11404,6 +12119,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire @@ -11558,6 +12283,16 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") release @@ -11712,6 +12447,16 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -11866,6 +12611,16 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -12048,6 +12803,18 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acquire @@ -12231,6 +12998,18 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -12414,6 +13193,18 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -12633,6 +13424,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -12852,6 +13657,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13071,6 +13890,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13290,6 +14123,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13509,6 +14356,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13728,6 +14589,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13947,6 +14822,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14166,6 +15055,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14385,6 +15288,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14604,6 +15521,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14823,6 +15754,20 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15042,6 +15987,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15261,6 +16220,20 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15480,6 +16453,20 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15699,6 +16686,20 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15948,6 +16949,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16199,6 +17216,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16450,6 +17483,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16701,6 +17750,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16952,6 +18017,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17203,6 +18284,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17454,6 +18551,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17705,6 +18818,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17956,6 +19085,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18207,6 +19352,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18458,6 +19619,22 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18709,6 +19886,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18960,6 +20153,22 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19211,6 +20420,22 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19462,6 +20687,22 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll index be148464c156..7ddd515830e1 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @global_system_unordered_load( ; GFX6-LABEL: global_system_unordered_load: @@ -190,6 +191,17 @@ define amdgpu_kernel void @global_system_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in unordered, align 4 @@ -374,6 +386,17 @@ define amdgpu_kernel void @global_system_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in monotonic, align 4 @@ -576,6 +599,18 @@ define amdgpu_kernel void @global_system_acquire_load( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in acquire, align 4 @@ -797,6 +832,24 @@ define amdgpu_kernel void @global_system_seq_cst_load( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in seq_cst, align 4 @@ -954,6 +1007,16 @@ define amdgpu_kernel void @global_system_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out unordered, align 4 @@ -1110,6 +1173,16 @@ define amdgpu_kernel void @global_system_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out monotonic, align 4 @@ -1295,6 +1368,21 @@ define amdgpu_kernel void @global_system_release_store( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out release, align 4 @@ -1480,6 +1568,21 @@ define amdgpu_kernel void @global_system_seq_cst_store( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out seq_cst, align 4 @@ -1634,6 +1737,16 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in monotonic @@ -1819,6 +1932,18 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire @@ -2002,6 +2127,21 @@ define amdgpu_kernel void @global_system_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in release @@ -2216,6 +2356,23 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel @@ -2430,6 +2587,23 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst @@ -2630,6 +2804,19 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acquire @@ -2864,6 +3051,26 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in acq_rel @@ -3098,6 +3305,26 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in seq_cst @@ -3317,6 +3544,20 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3567,6 +3808,22 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3815,6 +4072,25 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4094,6 +4370,27 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4373,6 +4670,27 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4623,6 +4941,22 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4873,6 +5207,22 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5152,6 +5502,27 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5431,6 +5802,27 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5710,6 +6102,27 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5989,6 +6402,27 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6238,6 +6672,22 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6507,6 +6957,23 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6809,6 +7276,30 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7111,6 +7602,30 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7384,6 +7899,25 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7653,6 +8187,23 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7955,6 +8506,30 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8257,6 +8832,30 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8559,6 +9158,30 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8861,6 +9484,30 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9159,6 +9806,28 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9461,6 +10130,30 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_relese_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9763,6 +10456,30 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10065,6 +10782,30 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10251,6 +10992,17 @@ define amdgpu_kernel void @global_system_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") unordered, align 4 @@ -10435,6 +11187,17 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") monotonic, align 4 @@ -10637,6 +11400,18 @@ define amdgpu_kernel void @global_system_one_as_acquire_load( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") acquire, align 4 @@ -10858,6 +11633,24 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("one-as") seq_cst, align 4 @@ -11015,6 +11808,16 @@ define amdgpu_kernel void @global_system_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") unordered, align 4 @@ -11171,6 +11974,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") monotonic, align 4 @@ -11356,6 +12169,21 @@ define amdgpu_kernel void @global_system_one_as_release_store( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") release, align 4 @@ -11541,6 +12369,21 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("one-as") seq_cst, align 4 @@ -11695,6 +12538,16 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") monotonic @@ -11880,6 +12733,18 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire @@ -12063,6 +12928,21 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") release @@ -12277,6 +13157,23 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel @@ -12491,6 +13388,23 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst @@ -12691,6 +13605,19 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acquire @@ -12925,6 +13852,26 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") acq_rel @@ -13159,6 +14106,26 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("one-as") seq_cst @@ -13378,6 +14345,20 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13628,6 +14609,22 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13876,6 +14873,25 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14155,6 +15171,27 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14434,6 +15471,27 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14684,6 +15742,22 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14934,6 +16008,22 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15213,6 +16303,27 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15492,6 +16603,27 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15771,6 +16903,27 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16050,6 +17203,27 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16329,6 +17503,27 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16608,6 +17803,27 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16887,6 +18103,27 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17166,6 +18403,27 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17415,6 +18673,22 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17684,6 +18958,23 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17964,6 +19255,27 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18266,6 +19578,30 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18568,6 +19904,30 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18841,6 +20201,25 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19110,6 +20489,23 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19412,6 +20808,30 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19714,6 +21134,30 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20016,6 +21460,30 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20318,6 +21786,30 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20616,6 +22108,28 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20918,6 +22432,30 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21220,6 +22758,30 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -21522,6 +23084,30 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: global_inv scope:SCOPE_SYS ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index 8a5c5dda9f79..0d18963cbfb6 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -8,6 +8,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @global_volatile_load_0( ; GFX6-LABEL: global_volatile_load_0: @@ -146,6 +147,19 @@ define amdgpu_kernel void @global_volatile_load_0( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_volatile_load_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load volatile i32, ptr addrspace(1) %in, align 4 @@ -345,6 +359,23 @@ define amdgpu_kernel void @global_volatile_load_1( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_volatile_load_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s4, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_bvhcnt 0x0 +; GFX1250-NEXT: s_wait_samplecnt 0x0 +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -501,6 +532,19 @@ define amdgpu_kernel void @global_volatile_store_0( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_volatile_store_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load i32, ptr addrspace(1) %in, align 4 @@ -693,6 +737,21 @@ define amdgpu_kernel void @global_volatile_store_1( ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS ; GFX12-CU-NEXT: s_wait_storecnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_volatile_store_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s3, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v0, v0, s3 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -838,6 +897,17 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_volatile_workgroup_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic volatile i32, ptr addrspace(1) %in syncscope("workgroup") acquire, align 4 @@ -969,6 +1039,17 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_volatile_workgroup_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic volatile i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll index 151ba07a0b53..1aa8305b1a83 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @global_wavefront_unordered_load( ; GFX6-LABEL: global_wavefront_unordered_load: @@ -190,6 +191,17 @@ define amdgpu_kernel void @global_wavefront_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") unordered, align 4 @@ -374,6 +386,17 @@ define amdgpu_kernel void @global_wavefront_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") monotonic, align 4 @@ -558,6 +581,17 @@ define amdgpu_kernel void @global_wavefront_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") acquire, align 4 @@ -742,6 +776,17 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront") seq_cst, align 4 @@ -899,6 +944,16 @@ define amdgpu_kernel void @global_wavefront_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") unordered, align 4 @@ -1055,6 +1110,16 @@ define amdgpu_kernel void @global_wavefront_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") monotonic, align 4 @@ -1211,6 +1276,16 @@ define amdgpu_kernel void @global_wavefront_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") release, align 4 @@ -1367,6 +1442,16 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront") seq_cst, align 4 @@ -1521,6 +1606,16 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") monotonic @@ -1675,6 +1770,16 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire @@ -1829,6 +1934,16 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") release @@ -1983,6 +2098,16 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel @@ -2137,6 +2262,16 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst @@ -2319,6 +2454,18 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acquire @@ -2502,6 +2649,18 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") acq_rel @@ -2685,6 +2844,18 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront") seq_cst @@ -2904,6 +3075,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3123,6 +3308,20 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3342,6 +3541,20 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3561,6 +3774,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3780,6 +4007,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3999,6 +4240,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4218,6 +4473,20 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4437,6 +4706,20 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4656,6 +4939,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4875,6 +5172,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5094,6 +5405,20 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5313,6 +5638,20 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5532,6 +5871,20 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5751,6 +6104,20 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5970,6 +6337,20 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6219,6 +6600,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6470,6 +6867,22 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6721,6 +7134,22 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6972,6 +7401,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7223,6 +7668,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7474,6 +7935,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7725,6 +8202,22 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7976,6 +8469,22 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8227,6 +8736,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8478,6 +9003,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8729,6 +9270,22 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8980,6 +9537,22 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9231,6 +9804,22 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9482,6 +10071,22 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9733,6 +10338,22 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9919,6 +10540,17 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") unordered, align 4 @@ -10103,6 +10735,17 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") monotonic, align 4 @@ -10287,6 +10930,17 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") acquire, align 4 @@ -10471,6 +11125,17 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("wavefront-one-as") seq_cst, align 4 @@ -10628,6 +11293,16 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") unordered, align 4 @@ -10784,6 +11459,16 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") monotonic, align 4 @@ -10940,6 +11625,16 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") release, align 4 @@ -11096,6 +11791,16 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("wavefront-one-as") seq_cst, align 4 @@ -11250,6 +11955,16 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") monotonic @@ -11404,6 +12119,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire @@ -11558,6 +12283,16 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") release @@ -11712,6 +12447,16 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -11866,6 +12611,16 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -12048,6 +12803,18 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acquire @@ -12231,6 +12998,18 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -12414,6 +13193,18 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -12633,6 +13424,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -12852,6 +13657,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13071,6 +13890,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13290,6 +14123,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13509,6 +14356,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13728,6 +14589,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13947,6 +14822,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14166,6 +15055,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14385,6 +15288,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14604,6 +15521,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14823,6 +15754,20 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15042,6 +15987,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15261,6 +16220,20 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15480,6 +16453,20 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15699,6 +16686,20 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15948,6 +16949,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16199,6 +17216,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16450,6 +17483,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16701,6 +17750,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16952,6 +18017,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17203,6 +18284,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17454,6 +18551,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17705,6 +18818,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17956,6 +19085,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18207,6 +19352,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18458,6 +19619,22 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18709,6 +19886,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18960,6 +20153,22 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19211,6 +20420,22 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19462,6 +20687,22 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index 69b0c7f93ab0..3eab16e6b971 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @global_workgroup_unordered_load( ; GFX6-LABEL: global_workgroup_unordered_load: @@ -190,6 +191,17 @@ define amdgpu_kernel void @global_workgroup_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") unordered, align 4 @@ -374,6 +386,17 @@ define amdgpu_kernel void @global_workgroup_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") monotonic, align 4 @@ -563,6 +586,17 @@ define amdgpu_kernel void @global_workgroup_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") acquire, align 4 @@ -764,6 +798,18 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup") seq_cst, align 4 @@ -921,6 +967,16 @@ define amdgpu_kernel void @global_workgroup_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") unordered, align 4 @@ -1077,6 +1133,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") monotonic, align 4 @@ -1251,6 +1317,17 @@ define amdgpu_kernel void @global_workgroup_release_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") release, align 4 @@ -1425,6 +1502,17 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup") seq_cst, align 4 @@ -1579,6 +1667,16 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") monotonic @@ -1743,6 +1841,16 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire @@ -1915,6 +2023,17 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") release @@ -2097,6 +2216,17 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel @@ -2279,6 +2409,17 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -2466,6 +2607,18 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acquire @@ -2674,6 +2827,19 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") acq_rel @@ -2882,6 +3048,19 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst @@ -3101,6 +3280,20 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3330,6 +3523,20 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3567,6 +3774,21 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -3814,6 +4036,21 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4061,6 +4298,21 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4290,6 +4542,20 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4519,6 +4785,20 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -4766,6 +5046,21 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5013,6 +5308,21 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5260,6 +5570,21 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5507,6 +5832,21 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -5754,6 +6094,21 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6001,6 +6356,21 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6248,6 +6618,21 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6495,6 +6880,21 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -6744,6 +7144,22 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7000,6 +7416,22 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7269,6 +7701,23 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7545,6 +7994,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -7821,6 +8287,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8079,6 +8562,22 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8335,6 +8834,22 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8611,6 +9126,23 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -8887,6 +9419,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9163,6 +9712,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9439,6 +10005,23 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9713,6 +10296,23 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -9989,6 +10589,23 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10265,6 +10882,23 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10541,6 +11175,23 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -10727,6 +11378,17 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") unordered, align 4 @@ -10911,6 +11573,17 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") monotonic, align 4 @@ -11100,6 +11773,17 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") acquire, align 4 @@ -11297,6 +11981,17 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(1) %out) { entry: %val = load atomic i32, ptr addrspace(1) %in syncscope("workgroup-one-as") seq_cst, align 4 @@ -11454,6 +12149,16 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") unordered, align 4 @@ -11610,6 +12315,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") monotonic, align 4 @@ -11776,6 +12491,16 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") release, align 4 @@ -11942,6 +12667,16 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(1) %out) { entry: store atomic i32 %in, ptr addrspace(1) %out syncscope("workgroup-one-as") seq_cst, align 4 @@ -12096,6 +12831,16 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") monotonic @@ -12260,6 +13005,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire @@ -12424,6 +13179,16 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") release @@ -12598,6 +13363,16 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -12772,6 +13547,16 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -12959,6 +13744,18 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acquire @@ -13159,6 +13956,18 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -13359,6 +14168,18 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -13578,6 +14399,20 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -13807,6 +14642,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14036,6 +14885,20 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14275,6 +15138,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14514,6 +15391,20 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14743,6 +15634,20 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -14972,6 +15877,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15211,6 +16130,20 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15450,6 +16383,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15689,6 +16636,20 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -15928,6 +16889,20 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16167,6 +17142,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16406,6 +17395,20 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16645,6 +17648,20 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -16884,6 +17901,20 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-CU-NEXT: global_atomic_cmpswap_b32 v0, v[1:2], s[0:1] offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17133,6 +18164,22 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17389,6 +18436,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17650,6 +18713,22 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -17918,6 +18997,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18186,6 +19281,22 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18444,6 +19555,22 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18700,6 +19827,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -18968,6 +20111,22 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19236,6 +20395,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19504,6 +20679,22 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -19772,6 +20963,22 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20038,6 +21245,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20306,6 +21529,22 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20574,6 +21813,22 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 @@ -20842,6 +22097,22 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_loadcnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x8 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0xc +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v2, s3 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:16 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(1) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll index 0467c5047a0b..102616b9a206 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @local_agent_unordered_load( ; GFX6-LABEL: local_agent_unordered_load: @@ -177,6 +178,18 @@ define amdgpu_kernel void @local_agent_unordered_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent") unordered, align 4 @@ -348,6 +361,18 @@ define amdgpu_kernel void @local_agent_monotonic_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent") monotonic, align 4 @@ -524,6 +549,18 @@ define amdgpu_kernel void @local_agent_acquire_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent") acquire, align 4 @@ -718,6 +755,19 @@ define amdgpu_kernel void @local_agent_seq_cst_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent") seq_cst, align 4 @@ -859,6 +909,16 @@ define amdgpu_kernel void @local_agent_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") unordered, align 4 @@ -999,6 +1059,16 @@ define amdgpu_kernel void @local_agent_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") monotonic, align 4 @@ -1157,6 +1227,17 @@ define amdgpu_kernel void @local_agent_release_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") release, align 4 @@ -1315,6 +1396,17 @@ define amdgpu_kernel void @local_agent_seq_cst_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent") seq_cst, align 4 @@ -1455,6 +1547,16 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") monotonic @@ -1611,6 +1713,17 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acquire @@ -1769,6 +1882,17 @@ define amdgpu_kernel void @local_agent_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") release @@ -1943,6 +2067,18 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acq_rel @@ -2117,6 +2253,18 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") seq_cst @@ -2304,6 +2452,19 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acquire @@ -2510,6 +2671,20 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") acq_rel @@ -2716,6 +2891,20 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent") seq_cst @@ -2883,6 +3072,18 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3066,6 +3267,19 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3251,6 +3465,19 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3452,6 +3679,20 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3653,6 +3894,20 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3836,6 +4091,19 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4019,6 +4287,19 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4220,6 +4501,20 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4421,6 +4716,20 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4622,6 +4931,20 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4823,6 +5146,20 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5024,6 +5361,20 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5225,6 +5576,20 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5426,6 +5791,20 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5627,6 +6006,20 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5836,6 +6229,21 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6052,6 +6460,21 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6281,6 +6704,22 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6515,6 +6954,22 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6749,6 +7204,22 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6965,6 +7436,21 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7181,6 +7667,21 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7415,6 +7916,22 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7649,6 +8166,22 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7883,6 +8416,22 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8117,6 +8666,22 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8351,6 +8916,22 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8585,6 +9166,22 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8819,6 +9416,22 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9053,6 +9666,22 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9226,6 +9855,18 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent-one-as") unordered, align 4 @@ -9397,6 +10038,18 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent-one-as") monotonic, align 4 @@ -9568,6 +10221,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent-one-as") acquire, align 4 @@ -9739,6 +10404,18 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("agent-one-as") seq_cst, align 4 @@ -9880,6 +10557,16 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent-one-as") unordered, align 4 @@ -10020,6 +10707,16 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent-one-as") monotonic, align 4 @@ -10160,6 +10857,16 @@ define amdgpu_kernel void @local_agent_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent-one-as") release, align 4 @@ -10300,6 +11007,16 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("agent-one-as") seq_cst, align 4 @@ -10440,6 +11157,16 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") monotonic @@ -10580,6 +11307,16 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") acquire @@ -10720,6 +11457,16 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") release @@ -10860,6 +11607,16 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") acq_rel @@ -11000,6 +11757,16 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") seq_cst @@ -11182,6 +11949,19 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") acquire @@ -11365,6 +12145,19 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") acq_rel @@ -11548,6 +12341,19 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("agent-one-as") seq_cst @@ -11715,6 +12521,18 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11882,6 +12700,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12049,6 +12879,18 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12216,6 +13058,18 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12383,6 +13237,18 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12550,6 +13416,18 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12717,6 +13595,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12884,6 +13774,18 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13051,6 +13953,18 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13218,6 +14132,18 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13385,6 +14311,18 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13552,6 +14490,18 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13719,6 +14669,18 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13886,6 +14848,18 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14053,6 +15027,18 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14262,6 +15248,21 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14473,6 +15474,21 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14684,6 +15700,21 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14895,6 +15926,21 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15106,6 +16152,21 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15317,6 +16378,21 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15528,6 +16604,21 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15739,6 +16830,21 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15950,6 +17056,21 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16161,6 +17282,21 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16372,6 +17508,21 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16583,6 +17734,21 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16794,6 +17960,21 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17005,6 +18186,21 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17216,6 +18412,21 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index 78209ee34cad..c6f7ce51f5ea 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @local_nontemporal_load_0( ; GFX6-LABEL: local_nontemporal_load_0: @@ -193,6 +194,18 @@ define amdgpu_kernel void @local_nontemporal_load_0( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_nontemporal_load_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ds_load_b32 v1, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { entry: %val = load i32, ptr addrspace(3) %in, align 4, !nontemporal !0 @@ -428,6 +441,22 @@ define amdgpu_kernel void @local_nontemporal_load_1( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_nontemporal_load_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX1250-NEXT: ds_load_b32 v1, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -597,6 +626,18 @@ define amdgpu_kernel void @local_nontemporal_store_0( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_nontemporal_store_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(3) %out) { entry: %val = load i32, ptr addrspace(1) %in, align 4 @@ -802,6 +843,22 @@ define amdgpu_kernel void @local_nontemporal_store_1( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_nontemporal_store_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s1, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v0, v0, s1 +; GFX1250-NEXT: s_mov_b32 s1, 2 +; GFX1250-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(3) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -991,6 +1048,18 @@ define amdgpu_kernel void @local_nontemporal_volatile_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_nontemporal_volatile_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ds_load_b32 v1, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { entry: %val = load volatile i32, ptr addrspace(3) %in, align 4, !nontemporal !0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll index f84d451f8ecb..1800acbbf605 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @local_singlethread_unordered_load( ; GFX6-LABEL: local_singlethread_unordered_load: @@ -177,6 +178,18 @@ define amdgpu_kernel void @local_singlethread_unordered_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread") unordered, align 4 @@ -348,6 +361,18 @@ define amdgpu_kernel void @local_singlethread_monotonic_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread") monotonic, align 4 @@ -519,6 +544,18 @@ define amdgpu_kernel void @local_singlethread_acquire_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread") acquire, align 4 @@ -690,6 +727,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread") seq_cst, align 4 @@ -831,6 +880,16 @@ define amdgpu_kernel void @local_singlethread_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread") unordered, align 4 @@ -971,6 +1030,16 @@ define amdgpu_kernel void @local_singlethread_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread") monotonic, align 4 @@ -1111,6 +1180,16 @@ define amdgpu_kernel void @local_singlethread_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread") release, align 4 @@ -1251,6 +1330,16 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread") seq_cst, align 4 @@ -1391,6 +1480,16 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") monotonic @@ -1531,6 +1630,16 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") acquire @@ -1671,6 +1780,16 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") release @@ -1811,6 +1930,16 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") acq_rel @@ -1951,6 +2080,16 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") seq_cst @@ -2133,6 +2272,19 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") acquire @@ -2316,6 +2468,19 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") acq_rel @@ -2499,6 +2664,19 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread") seq_cst @@ -2666,6 +2844,18 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -2833,6 +3023,18 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3000,6 +3202,18 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3167,6 +3381,18 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3334,6 +3560,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3501,6 +3739,18 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3668,6 +3918,18 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3835,6 +4097,18 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4002,6 +4276,18 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4169,6 +4455,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4336,6 +4634,18 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4503,6 +4813,18 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4670,6 +4992,18 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4837,6 +5171,18 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5004,6 +5350,18 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5213,6 +5571,21 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5424,6 +5797,21 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5635,6 +6023,21 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5846,6 +6249,21 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6057,6 +6475,21 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6268,6 +6701,21 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6479,6 +6927,21 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6690,6 +7153,21 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6901,6 +7379,21 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7112,6 +7605,21 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7323,6 +7831,21 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7534,6 +8057,21 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7745,6 +8283,21 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7956,6 +8509,21 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8167,6 +8735,21 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8340,6 +8923,18 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread-one-as") unordered, align 4 @@ -8511,6 +9106,18 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread-one-as") monotonic, align 4 @@ -8682,6 +9289,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread-one-as") acquire, align 4 @@ -8853,6 +9472,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("singlethread-one-as") seq_cst, align 4 @@ -8994,6 +9625,16 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread-one-as") unordered, align 4 @@ -9134,6 +9775,16 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread-one-as") monotonic, align 4 @@ -9274,6 +9925,16 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread-one-as") release, align 4 @@ -9414,6 +10075,16 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("singlethread-one-as") seq_cst, align 4 @@ -9554,6 +10225,16 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") monotonic @@ -9694,6 +10375,16 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") acquire @@ -9834,6 +10525,16 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") release @@ -9974,6 +10675,16 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -10114,6 +10825,16 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -10296,6 +11017,19 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") acquire @@ -10479,6 +11213,19 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") acq_rel @@ -10662,6 +11409,19 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("singlethread-one-as") seq_cst @@ -10829,6 +11589,18 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -10996,6 +11768,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11163,6 +11947,18 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11330,6 +12126,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11497,6 +12305,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11664,6 +12484,18 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11831,6 +12663,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11998,6 +12842,18 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12165,6 +13021,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12332,6 +13200,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12499,6 +13379,18 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12666,6 +13558,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12833,6 +13737,18 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13000,6 +13916,18 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13167,6 +14095,18 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13376,6 +14316,21 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13587,6 +14542,21 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13798,6 +14768,21 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14009,6 +14994,21 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14220,6 +15220,21 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14431,6 +15446,21 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14642,6 +15672,21 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14853,6 +15898,21 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15064,6 +16124,21 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15275,6 +16350,21 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15486,6 +16576,21 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15697,6 +16802,21 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15908,6 +17028,21 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16119,6 +17254,21 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16330,6 +17480,21 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll index 74a297241d85..1356fe485417 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @local_system_unordered_load( ; GFX6-LABEL: local_system_unordered_load: @@ -177,6 +178,18 @@ define amdgpu_kernel void @local_system_unordered_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in unordered, align 4 @@ -348,6 +361,18 @@ define amdgpu_kernel void @local_system_monotonic_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in monotonic, align 4 @@ -524,6 +549,18 @@ define amdgpu_kernel void @local_system_acquire_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in acquire, align 4 @@ -718,6 +755,19 @@ define amdgpu_kernel void @local_system_seq_cst_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in seq_cst, align 4 @@ -859,6 +909,16 @@ define amdgpu_kernel void @local_system_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out unordered, align 4 @@ -999,6 +1059,16 @@ define amdgpu_kernel void @local_system_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out monotonic, align 4 @@ -1157,6 +1227,17 @@ define amdgpu_kernel void @local_system_release_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out release, align 4 @@ -1315,6 +1396,17 @@ define amdgpu_kernel void @local_system_seq_cst_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out seq_cst, align 4 @@ -1455,6 +1547,16 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in monotonic @@ -1611,6 +1713,17 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acquire @@ -1769,6 +1882,17 @@ define amdgpu_kernel void @local_system_release_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in release @@ -1943,6 +2067,18 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acq_rel @@ -2117,6 +2253,18 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in seq_cst @@ -2304,6 +2452,19 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acquire @@ -2510,6 +2671,20 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in acq_rel @@ -2716,6 +2891,20 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in seq_cst @@ -2883,6 +3072,18 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3066,6 +3267,19 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3251,6 +3465,19 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3452,6 +3679,20 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3653,6 +3894,20 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3836,6 +4091,19 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4019,6 +4287,19 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4220,6 +4501,20 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4421,6 +4716,20 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4622,6 +4931,20 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4823,6 +5146,20 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5024,6 +5361,20 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5225,6 +5576,20 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5426,6 +5791,20 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5627,6 +6006,20 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5836,6 +6229,21 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6052,6 +6460,21 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6281,6 +6704,22 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6515,6 +6954,22 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6749,6 +7204,22 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6965,6 +7436,21 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7181,6 +7667,21 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7415,6 +7916,22 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7649,6 +8166,22 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7883,6 +8416,22 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8117,6 +8666,22 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8351,6 +8916,22 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8585,6 +9166,22 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8819,6 +9416,22 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9053,6 +9666,22 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9226,6 +9855,18 @@ define amdgpu_kernel void @local_system_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("one-as") unordered, align 4 @@ -9397,6 +10038,18 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("one-as") monotonic, align 4 @@ -9568,6 +10221,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("one-as") acquire, align 4 @@ -9739,6 +10404,18 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("one-as") seq_cst, align 4 @@ -9880,6 +10557,16 @@ define amdgpu_kernel void @local_system_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("one-as") unordered, align 4 @@ -10020,6 +10707,16 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("one-as") monotonic, align 4 @@ -10160,6 +10857,16 @@ define amdgpu_kernel void @local_system_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("one-as") release, align 4 @@ -10300,6 +11007,16 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("one-as") seq_cst, align 4 @@ -10440,6 +11157,16 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") monotonic @@ -10580,6 +11307,16 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") acquire @@ -10720,6 +11457,16 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") release @@ -10860,6 +11607,16 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") acq_rel @@ -11000,6 +11757,16 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") seq_cst @@ -11182,6 +11949,19 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") acquire @@ -11365,6 +12145,19 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") acq_rel @@ -11548,6 +12341,19 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("one-as") seq_cst @@ -11715,6 +12521,18 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11882,6 +12700,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12049,6 +12879,18 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12216,6 +13058,18 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12383,6 +13237,18 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12550,6 +13416,18 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12717,6 +13595,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12884,6 +13774,18 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13051,6 +13953,18 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13218,6 +14132,18 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13385,6 +14311,18 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13552,6 +14490,18 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13719,6 +14669,18 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13886,6 +14848,18 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14053,6 +15027,18 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14262,6 +15248,21 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14473,6 +15474,21 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14684,6 +15700,21 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14895,6 +15926,21 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15106,6 +16152,21 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15317,6 +16378,21 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15528,6 +16604,21 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15739,6 +16830,21 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15950,6 +17056,21 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16161,6 +17282,21 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16372,6 +17508,21 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16583,6 +17734,21 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16794,6 +17960,21 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17005,6 +18186,21 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17216,6 +18412,21 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index 5e5e3bf83d61..75e28f9008e2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -8,6 +8,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @local_volatile_load_0( ; GFX6-LABEL: local_volatile_load_0: @@ -141,6 +142,18 @@ define amdgpu_kernel void @local_volatile_load_0( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_volatile_load_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: ds_load_b32 v1, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { entry: %val = load volatile i32, ptr addrspace(3) %in, align 4 @@ -308,6 +321,22 @@ define amdgpu_kernel void @local_volatile_load_1( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_volatile_load_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: v_mov_b32_e32 v1, v0 +; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: s_mov_b32 s2, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v1, v1, s2 +; GFX1250-NEXT: s_mov_b32 s2, 2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshl_add_u32 v1, v1, s2, s3 +; GFX1250-NEXT: ds_load_b32 v1, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(1) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -429,6 +458,18 @@ define amdgpu_kernel void @local_volatile_store_0( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_volatile_store_0: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[2:3], 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(3) %out) { entry: %val = load i32, ptr addrspace(1) %in, align 4 @@ -570,6 +611,22 @@ define amdgpu_kernel void @local_volatile_store_1( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_volatile_store_1: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[0:1], 0x0 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s1, 0x3ff +; GFX1250-NEXT: v_and_b32_e64 v0, v0, s1 +; GFX1250-NEXT: s_mov_b32 s1, 2 +; GFX1250-NEXT: v_lshl_add_u32 v0, v0, s1, s2 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(1) %in, ptr addrspace(3) %out) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -698,6 +755,18 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load( ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_volatile_workgroup_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic volatile i32, ptr addrspace(3) %in syncscope("workgroup") acquire, align 4 @@ -813,6 +882,17 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_volatile_workgroup_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic volatile i32 %in, ptr addrspace(3) %out syncscope("workgroup") release, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll index b24622a48a16..7e345ed6e271 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll @@ -12,6 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @local_wavefront_unordered_load( ; GFX6-LABEL: local_wavefront_unordered_load: @@ -177,6 +178,18 @@ define amdgpu_kernel void @local_wavefront_unordered_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront") unordered, align 4 @@ -348,6 +361,18 @@ define amdgpu_kernel void @local_wavefront_monotonic_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront") monotonic, align 4 @@ -519,6 +544,18 @@ define amdgpu_kernel void @local_wavefront_acquire_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront") acquire, align 4 @@ -690,6 +727,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront") seq_cst, align 4 @@ -831,6 +880,16 @@ define amdgpu_kernel void @local_wavefront_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront") unordered, align 4 @@ -971,6 +1030,16 @@ define amdgpu_kernel void @local_wavefront_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront") monotonic, align 4 @@ -1111,6 +1180,16 @@ define amdgpu_kernel void @local_wavefront_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront") release, align 4 @@ -1251,6 +1330,16 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront") seq_cst, align 4 @@ -1391,6 +1480,16 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") monotonic @@ -1531,6 +1630,16 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") acquire @@ -1671,6 +1780,16 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") release @@ -1811,6 +1930,16 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") acq_rel @@ -1951,6 +2080,16 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") seq_cst @@ -2133,6 +2272,19 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") acquire @@ -2316,6 +2468,19 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") acq_rel @@ -2499,6 +2664,19 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront") seq_cst @@ -2666,6 +2844,18 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -2833,6 +3023,18 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3000,6 +3202,18 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3167,6 +3381,18 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3334,6 +3560,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3501,6 +3739,18 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3668,6 +3918,18 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3835,6 +4097,18 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4002,6 +4276,18 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4169,6 +4455,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4336,6 +4634,18 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4503,6 +4813,18 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4670,6 +4992,18 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4837,6 +5171,18 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5004,6 +5350,18 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5213,6 +5571,21 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5424,6 +5797,21 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5635,6 +6023,21 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5846,6 +6249,21 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6057,6 +6475,21 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6268,6 +6701,21 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6479,6 +6927,21 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6690,6 +7153,21 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6901,6 +7379,21 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7112,6 +7605,21 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7323,6 +7831,21 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7534,6 +8057,21 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7745,6 +8283,21 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7956,6 +8509,21 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8167,6 +8735,21 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8340,6 +8923,18 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront-one-as") unordered, align 4 @@ -8511,6 +9106,18 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront-one-as") monotonic, align 4 @@ -8682,6 +9289,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront-one-as") acquire, align 4 @@ -8853,6 +9472,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("wavefront-one-as") seq_cst, align 4 @@ -8994,6 +9625,16 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront-one-as") unordered, align 4 @@ -9134,6 +9775,16 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront-one-as") monotonic, align 4 @@ -9274,6 +9925,16 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront-one-as") release, align 4 @@ -9414,6 +10075,16 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("wavefront-one-as") seq_cst, align 4 @@ -9554,6 +10225,16 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") monotonic @@ -9694,6 +10375,16 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") acquire @@ -9834,6 +10525,16 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") release @@ -9974,6 +10675,16 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -10114,6 +10825,16 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s0 ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -10296,6 +11017,19 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") acquire @@ -10479,6 +11213,19 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") acq_rel @@ -10662,6 +11409,19 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("wavefront-one-as") seq_cst @@ -10829,6 +11589,18 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -10996,6 +11768,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11163,6 +11947,18 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11330,6 +12126,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11497,6 +12305,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11664,6 +12484,18 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11831,6 +12663,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -11998,6 +12842,18 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12165,6 +13021,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12332,6 +13200,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12499,6 +13379,18 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12666,6 +13558,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12833,6 +13737,18 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13000,6 +13916,18 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13167,6 +14095,18 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13376,6 +14316,21 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13587,6 +14542,21 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13798,6 +14768,21 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14009,6 +14994,21 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14220,6 +15220,21 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14431,6 +15446,21 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14642,6 +15672,21 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14853,6 +15898,21 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15064,6 +16124,21 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15275,6 +16350,21 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15486,6 +16576,21 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15697,6 +16802,21 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15908,6 +17028,21 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16119,6 +17254,21 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16330,6 +17480,21 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm +; +; GFX1250-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll index 94f5aab1eb67..6aaf9d323b1f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -12,7 +12,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GFX1250 %s define amdgpu_kernel void @local_workgroup_unordered_load( ; GFX6-LABEL: local_workgroup_unordered_load: @@ -179,17 +179,17 @@ define amdgpu_kernel void @local_workgroup_unordered_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") unordered, align 4 @@ -362,17 +362,17 @@ define amdgpu_kernel void @local_workgroup_monotonic_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") monotonic, align 4 @@ -550,17 +550,17 @@ define amdgpu_kernel void @local_workgroup_acquire_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") acquire, align 4 @@ -756,18 +756,18 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup") seq_cst, align 4 @@ -910,15 +910,15 @@ define amdgpu_kernel void @local_workgroup_unordered_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") unordered, align 4 @@ -1060,15 +1060,15 @@ define amdgpu_kernel void @local_workgroup_monotonic_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") monotonic, align 4 @@ -1228,16 +1228,16 @@ define amdgpu_kernel void @local_workgroup_release_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") release, align 4 @@ -1397,16 +1397,16 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup") seq_cst, align 4 @@ -1548,15 +1548,15 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") monotonic @@ -1714,16 +1714,16 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acquire @@ -1883,16 +1883,16 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") release @@ -2068,17 +2068,17 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acq_rel @@ -2254,17 +2254,17 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") seq_cst @@ -2453,18 +2453,18 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acquire @@ -2672,19 +2672,19 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") acq_rel @@ -2892,19 +2892,19 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup") seq_cst @@ -3073,17 +3073,17 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3268,18 +3268,18 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3466,18 +3466,18 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3680,19 +3680,19 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -3895,19 +3895,19 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4092,18 +4092,18 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4288,18 +4288,18 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4502,19 +4502,19 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4717,19 +4717,19 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -4932,19 +4932,19 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5147,19 +5147,19 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5362,19 +5362,19 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5577,19 +5577,19 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -5792,19 +5792,19 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6007,19 +6007,19 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: s_wait_dscnt 0x0 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6230,20 +6230,20 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6461,20 +6461,20 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6705,21 +6705,21 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -6955,21 +6955,21 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7205,21 +7205,21 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7437,20 +7437,20 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7668,20 +7668,20 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -7917,21 +7917,21 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8167,21 +8167,21 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8417,21 +8417,21 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8667,21 +8667,21 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -8917,21 +8917,21 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9167,21 +9167,21 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9417,21 +9417,21 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9667,21 +9667,21 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -9856,17 +9856,17 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_unordered_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_unordered_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup-one-as") unordered, align 4 @@ -10039,17 +10039,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_monotonic_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup-one-as") monotonic, align 4 @@ -10222,17 +10222,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acquire_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup-one-as") acquire, align 4 @@ -10405,17 +10405,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_load: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: ds_load_b32 v1, v0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_seq_cst_load: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: ds_load_b32 v1, v0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %in, ptr addrspace(3) %out) { entry: %val = load atomic i32, ptr addrspace(3) %in syncscope("workgroup-one-as") seq_cst, align 4 @@ -10558,15 +10558,15 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_unordered_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_unordered_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup-one-as") unordered, align 4 @@ -10708,15 +10708,15 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_monotonic_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup-one-as") monotonic, align 4 @@ -10858,15 +10858,15 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_release_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_release_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup-one-as") release, align 4 @@ -11008,15 +11008,15 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_store: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_seq_cst_store: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm i32 %in, ptr addrspace(3) %out) { entry: store atomic i32 %in, ptr addrspace(3) %out syncscope("workgroup-one-as") seq_cst, align 4 @@ -11158,15 +11158,15 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_monotonic_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") monotonic @@ -11308,15 +11308,15 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acquire_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") acquire @@ -11458,15 +11458,15 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_release_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_release_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") release @@ -11608,15 +11608,15 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acq_rel_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -11758,15 +11758,15 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw( ; GFX12-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_seq_cst_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s1 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -11950,18 +11950,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") acquire @@ -12146,18 +12146,18 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") acq_rel @@ -12342,18 +12342,18 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in) { entry: %val = atomicrmw volatile xchg ptr addrspace(3) %out, i32 %in syncscope("workgroup-one-as") seq_cst @@ -12522,17 +12522,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12701,17 +12701,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -12880,17 +12880,17 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13059,17 +13059,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13238,17 +13238,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13417,17 +13417,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13596,17 +13596,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13775,17 +13775,17 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_release_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -13954,17 +13954,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14133,17 +14133,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14312,17 +14312,17 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14491,17 +14491,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14670,17 +14670,17 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -14849,17 +14849,17 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15028,17 +15028,17 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg( ; GFX12-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s0 -; GFX1250-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s2 +; GFX1250-NEXT: v_mov_b32_e32 v1, s1 +; GFX1250-NEXT: v_mov_b32_e32 v2, s0 +; GFX1250-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15249,20 +15249,20 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15475,20 +15475,20 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15701,20 +15701,20 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -15927,20 +15927,20 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16153,20 +16153,20 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16379,20 +16379,20 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16605,20 +16605,20 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -16831,20 +16831,20 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17057,20 +17057,20 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17283,20 +17283,20 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17509,20 +17509,20 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17735,20 +17735,20 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -17961,20 +17961,20 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -18187,20 +18187,20 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4 @@ -18413,20 +18413,20 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg( ; GFX12-CU-NEXT: ds_store_b32 v0, v1 ; GFX12-CU-NEXT: s_endpgm ; -; GFX1250-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: -; GFX1250-CU: ; %bb.0: ; %entry -; GFX1250-CU-NEXT: s_load_b32 s0, s[4:5], 0x0 -; GFX1250-CU-NEXT: s_load_b32 s2, s[4:5], 0x4 -; GFX1250-CU-NEXT: s_load_b32 s1, s[4:5], 0x8 -; GFX1250-CU-NEXT: s_wait_kmcnt 0x0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX1250-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX1250-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX1250-CU-NEXT: s_wait_dscnt 0x0 -; GFX1250-CU-NEXT: ds_store_b32 v0, v1 -; GFX1250-CU-NEXT: s_endpgm +; GFX1250-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 +; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x4 +; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s2 +; GFX1250-NEXT: v_mov_b32_e32 v2, s1 +; GFX1250-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 +; GFX1250-NEXT: v_mov_b32_e32 v0, s0 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: ds_store_b32 v0, v1 +; GFX1250-NEXT: s_endpgm ptr addrspace(3) %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, ptr addrspace(3) %out, i32 4