From b50cf35d57480a5523f2c9997b07ad8e274961bb Mon Sep 17 00:00:00 2001 From: Syadus Sefat <42645939+mssefat@users.noreply.github.com> Date: Tue, 10 Mar 2026 14:53:32 -0500 Subject: [PATCH] [AMDGPU][GlobalIsel] Add register bank legalization rules for amdgcn atomic fminmax num (#184564) This patch adds register bank legalization rules for amdgcn global/flat atomic fmin/fmax num operations in the AMDGPU GlobalISel pipeline. --- .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 8 + .../AMDGPU/fp-min-max-num-flat-atomics.ll | 175 +++++++++++++++--- .../AMDGPU/fp-min-max-num-global-atomics.ll | 94 +++++++--- 3 files changed, 221 insertions(+), 56 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index f3264536006d..51fcf9b6a82e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -1505,6 +1505,14 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForIOpcs({amdgcn_global_atomic_ordered_add_b64}) .Any({{DivS64}, {{Vgpr64}, {IntrId, VgprP1, Vgpr64}}}); + addRulesForIOpcs( + {amdgcn_global_atomic_fmin_num, amdgcn_global_atomic_fmax_num}, Standard) + .Div(S32, {{Vgpr32}, {IntrId, VgprP1, Vgpr32}}); + + addRulesForIOpcs({amdgcn_flat_atomic_fmin_num, amdgcn_flat_atomic_fmax_num}, + Standard) + .Div(S32, {{Vgpr32}, {IntrId, VgprP0, Vgpr32}}); + addRulesForIOpcs({amdgcn_raw_buffer_load_lds}) .Any({{_}, {{}, {IntrId, SgprV4S32, SgprP3, Imm, Vgpr32, Sgpr32}}}); diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll index 874aa543a214..4912f59f77d1 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-flat-atomics.ll @@ -1,53 +1,168 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG -; RUN: llc < %s -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "s_setreg_imm32_b32" --version 6 +; RUN: llc < %s -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefixes=GCN,GFX12,GFX12-SDAG +; RUN: llc < %s -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefixes=GCN,GFX12,GFX12-GISEL +; RUN: llc < %s -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 | FileCheck %s -check-prefixes=GCN,GFX1250,GFX1250-SDAG +; RUN: llc < %s -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 | FileCheck %s -check-prefixes=GCN,GFX1250,GFX1250-GISEL declare float @llvm.amdgcn.flat.atomic.fmin.num.f32.p1.f32(ptr %ptr, float %data) declare float @llvm.amdgcn.flat.atomic.fmax.num.f32.p1.f32(ptr %ptr, float %data) define amdgpu_cs void @flat_atomic_fmin_num_f32_noret(ptr %ptr, float %data) { -; GFX12-LABEL: flat_atomic_fmin_num_f32_noret: -; GFX12: ; %bb.0: -; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: flat_atomic_fmin_num_f32_noret: +; GCN: ; %bb.0: +; GCN: flat_atomic_min_num_f32 v[0:1], v2 +; GCN: s_endpgm %ret = call float @llvm.amdgcn.flat.atomic.fmin.num.f32.p1.f32(ptr %ptr, float %data) ret void } define amdgpu_cs void @flat_atomic_fmax_num_f32_noret(ptr %ptr, float %data) { -; GFX12-LABEL: flat_atomic_fmax_num_f32_noret: -; GFX12: ; %bb.0: -; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 -; GFX12-NEXT: s_endpgm +; GCN-LABEL: flat_atomic_fmax_num_f32_noret: +; GCN: ; %bb.0: +; GCN: flat_atomic_max_num_f32 v[0:1], v2 +; GCN: s_endpgm %ret = call float @llvm.amdgcn.flat.atomic.fmax.num.f32.p1.f32(ptr %ptr, float %data) ret void } -define amdgpu_cs float @flat_atomic_fmin_num_f32_rtn(ptr %ptr, float %data, ptr %out) { +define amdgpu_cs float @flat_atomic_fmin_num_f32_rtn(float %data, ptr %ptr) { ; GFX12-LABEL: flat_atomic_fmin_num_f32_rtn: -; GFX12: ; %bb.0: -; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: flat_store_b32 v[3:4], v0 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12: ; %bb.0: +; GFX12: flat_atomic_min_num_f32 v0, v[1:2], v0 th:TH_ATOMIC_RETURN +; GFX12: s_wait_loadcnt_dscnt 0x0 +; GFX12: ; return to shader part epilog +; +; GFX1250-SDAG-LABEL: flat_atomic_fmin_num_f32_rtn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG: flat_atomic_min_num_f32 v0, v[2:3], v0 th:TH_ATOMIC_RETURN +; GFX1250-SDAG: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_atomic_fmin_num_f32_rtn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL: flat_atomic_min_num_f32 v0, v[4:5], v0 th:TH_ATOMIC_RETURN +; GFX1250-GISEL: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL: ; return to shader part epilog %ret = call float @llvm.amdgcn.flat.atomic.fmin.num.f32.p1.f32(ptr %ptr, float %data) - store float %ret, ptr %out ret float %ret } -define amdgpu_cs float @flat_atomic_fmax_num_f32_rtn(ptr %ptr, float %data, ptr %out) { +define amdgpu_cs float @flat_atomic_fmax_num_f32_rtn(float %data, ptr %ptr) { ; GFX12-LABEL: flat_atomic_fmax_num_f32_rtn: -; GFX12: ; %bb.0: -; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: flat_store_b32 v[3:4], v0 -; GFX12-NEXT: s_wait_dscnt 0x0 -; GFX12-NEXT: ; return to shader part epilog +; GFX12: ; %bb.0: +; GFX12: flat_atomic_max_num_f32 v0, v[1:2], v0 th:TH_ATOMIC_RETURN +; GFX12: s_wait_loadcnt_dscnt 0x0 +; GFX12: ; return to shader part epilog +; +; GFX1250-SDAG-LABEL: flat_atomic_fmax_num_f32_rtn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG: flat_atomic_max_num_f32 v0, v[2:3], v0 th:TH_ATOMIC_RETURN +; GFX1250-SDAG: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_atomic_fmax_num_f32_rtn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL: flat_atomic_max_num_f32 v0, v[4:5], v0 th:TH_ATOMIC_RETURN +; GFX1250-GISEL: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL: ; return to shader part epilog + %ret = call float @llvm.amdgcn.flat.atomic.fmax.num.f32.p1.f32(ptr %ptr, float %data) + ret float %ret +} + +define amdgpu_ps void @flat_atomic_fmin_num_f32_noret_saddr(ptr inreg %ptr, float %data) { +; GFX12-SDAG-LABEL: flat_atomic_fmin_num_f32_noret_saddr: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX12-SDAG: flat_atomic_min_num_f32 v[1:2], v0 +; GFX12-SDAG: s_endpgm +; +; GFX12-GISEL-LABEL: flat_atomic_fmin_num_f32_noret_saddr: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX12-GISEL: flat_atomic_min_num_f32 v[1:2], v0 +; GFX12-GISEL: s_endpgm +; +; GFX1250-LABEL: flat_atomic_fmin_num_f32_noret_saddr: +; GFX1250: ; %bb.0: +; GFX1250: v_mov_b32_e32 v1, 0 +; GFX1250: flat_atomic_min_num_f32 v1, v0, s[0:1] +; GFX1250: s_endpgm + %ret = call float @llvm.amdgcn.flat.atomic.fmin.num.f32.p1.f32(ptr %ptr, float %data) + ret void +} + +define amdgpu_ps void @flat_atomic_fmax_num_f32_noret_saddr(ptr inreg %ptr, float %data) { +; GFX12-SDAG-LABEL: flat_atomic_fmax_num_f32_noret_saddr: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX12-SDAG: flat_atomic_max_num_f32 v[1:2], v0 +; GFX12-SDAG: s_endpgm +; +; GFX12-GISEL-LABEL: flat_atomic_fmax_num_f32_noret_saddr: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX12-GISEL: flat_atomic_max_num_f32 v[1:2], v0 +; GFX12-GISEL: s_endpgm +; +; GFX1250-LABEL: flat_atomic_fmax_num_f32_noret_saddr: +; GFX1250: ; %bb.0: +; GFX1250: v_mov_b32_e32 v1, 0 +; GFX1250: flat_atomic_max_num_f32 v1, v0, s[0:1] +; GFX1250: s_endpgm + %ret = call float @llvm.amdgcn.flat.atomic.fmax.num.f32.p1.f32(ptr %ptr, float %data) + ret void +} + +define amdgpu_ps float @flat_atomic_fmin_num_f32_rtn_saddr(ptr inreg %ptr, float %data) { +; GFX12-SDAG-LABEL: flat_atomic_fmin_num_f32_rtn_saddr: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX12-SDAG: flat_atomic_min_num_f32 v0, v[1:2], v0 th:TH_ATOMIC_RETURN +; GFX12-SDAG: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: flat_atomic_fmin_num_f32_rtn_saddr: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX12-GISEL: flat_atomic_min_num_f32 v0, v[1:2], v0 th:TH_ATOMIC_RETURN +; GFX12-GISEL: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL: ; return to shader part epilog +; +; GFX1250-LABEL: flat_atomic_fmin_num_f32_rtn_saddr: +; GFX1250: ; %bb.0: +; GFX1250: v_mov_b32_e32 v1, 0 +; GFX1250: flat_atomic_min_num_f32 v0, v1, v0, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250: s_wait_loadcnt_dscnt 0x0 +; GFX1250: ; return to shader part epilog + %ret = call float @llvm.amdgcn.flat.atomic.fmin.num.f32.p1.f32(ptr %ptr, float %data) + ret float %ret +} + +define amdgpu_ps float @flat_atomic_fmax_num_f32_rtn_saddr(ptr inreg %ptr, float %data) { +; GFX12-SDAG-LABEL: flat_atomic_fmax_num_f32_rtn_saddr: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1 +; GFX12-SDAG: flat_atomic_max_num_f32 v0, v[1:2], v0 th:TH_ATOMIC_RETURN +; GFX12-SDAG: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG: ; return to shader part epilog +; +; GFX12-GISEL-LABEL: flat_atomic_fmax_num_f32_rtn_saddr: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; GFX12-GISEL: flat_atomic_max_num_f32 v0, v[1:2], v0 th:TH_ATOMIC_RETURN +; GFX12-GISEL: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL: ; return to shader part epilog +; +; GFX1250-LABEL: flat_atomic_fmax_num_f32_rtn_saddr: +; GFX1250: ; %bb.0: +; GFX1250: v_mov_b32_e32 v1, 0 +; GFX1250: flat_atomic_max_num_f32 v0, v1, v0, s[0:1] th:TH_ATOMIC_RETURN +; GFX1250: s_wait_loadcnt_dscnt 0x0 +; GFX1250: ; return to shader part epilog %ret = call float @llvm.amdgcn.flat.atomic.fmax.num.f32.p1.f32(ptr %ptr, float %data) - store float %ret, ptr %out ret float %ret } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX12-GISEL: {{.*}} -; GFX12-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll index d5250581a6ca..2aa88b42154a 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-num-global-atomics.ll @@ -1,51 +1,93 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG -; RUN: llc < %s -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "s_setreg_imm32_b32" --version 6 +; RUN: llc < %s -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefixes=GCN declare float @llvm.amdgcn.global.atomic.fmin.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data) declare float @llvm.amdgcn.global.atomic.fmax.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data) define amdgpu_cs void @global_atomic_fmin_num_f32_noret(ptr addrspace(1) %ptr, float %data) { -; GFX12-LABEL: global_atomic_fmin_num_f32_noret: -; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: global_atomic_fmin_num_f32_noret: +; GCN: ; %bb.0: +; GCN: global_atomic_min_num_f32 v[0:1], v2, off +; GCN: s_endpgm %ret = call float @llvm.amdgcn.global.atomic.fmin.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data) ret void } define amdgpu_cs void @global_atomic_fmax_num_f32_noret(ptr addrspace(1) %ptr, float %data) { -; GFX12-LABEL: global_atomic_fmax_num_f32_noret: -; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: global_atomic_fmax_num_f32_noret: +; GCN: ; %bb.0: +; GCN: global_atomic_max_num_f32 v[0:1], v2, off +; GCN: s_endpgm %ret = call float @llvm.amdgcn.global.atomic.fmax.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data) ret void } define amdgpu_cs void @global_atomic_fmax_num_f32_rtn(ptr addrspace(1) %ptr, float %data, ptr addrspace(1) %out) { -; GFX12-LABEL: global_atomic_fmax_num_f32_rtn: -; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b32 v[3:4], v0, off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: global_atomic_fmax_num_f32_rtn: +; GCN: ; %bb.0: +; GCN: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GCN: s_wait_loadcnt 0x0 +; GCN: global_store_b32 v[3:4], v0, off +; GCN: s_endpgm %ret = call float @llvm.amdgcn.global.atomic.fmax.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data) store float %ret, ptr addrspace(1) %out ret void } define amdgpu_cs void @global_atomic_fmin_num_f32_rtn(ptr addrspace(1) %ptr, float %data, ptr addrspace(1) %out) { -; GFX12-LABEL: global_atomic_fmin_num_f32_rtn: -; GFX12: ; %bb.0: -; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b32 v[3:4], v0, off -; GFX12-NEXT: s_endpgm +; GCN-LABEL: global_atomic_fmin_num_f32_rtn: +; GCN: ; %bb.0: +; GCN: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN +; GCN: s_wait_loadcnt 0x0 +; GCN: global_store_b32 v[3:4], v0, off +; GCN: s_endpgm %ret = call float @llvm.amdgcn.global.atomic.fmin.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data) store float %ret, ptr addrspace(1) %out ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX12-GISEL: {{.*}} -; GFX12-SDAG: {{.*}} + +define amdgpu_ps void @global_atomic_fmin_num_f32_noret_saddr(ptr addrspace(1) inreg %ptr, float %data) { +; GCN-LABEL: global_atomic_fmin_num_f32_noret_saddr: +; GCN: ; %bb.0: +; GCN: v_mov_b32_e32 v1, 0 +; GCN: global_atomic_min_num_f32 v1, v0, s[0:1] +; GCN: s_endpgm + %ret = call float @llvm.amdgcn.global.atomic.fmin.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data) + ret void +} + +define amdgpu_ps void @global_atomic_fmax_num_f32_noret_saddr(ptr addrspace(1) inreg %ptr, float %data) { +; GCN-LABEL: global_atomic_fmax_num_f32_noret_saddr: +; GCN: ; %bb.0: +; GCN: v_mov_b32_e32 v1, 0 +; GCN: global_atomic_max_num_f32 v1, v0, s[0:1] +; GCN: s_endpgm + %ret = call float @llvm.amdgcn.global.atomic.fmax.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data) + ret void +} + +define amdgpu_ps void @global_atomic_fmin_num_f32_rtn_saddr(ptr addrspace(1) inreg %ptr, float %data, ptr addrspace(1) %out) { +; GCN-LABEL: global_atomic_fmin_num_f32_rtn_saddr: +; GCN: ; %bb.0: +; GCN: v_mov_b32_e32 v3, 0 +; GCN: global_atomic_min_num_f32 v0, v3, v0, s[0:1] th:TH_ATOMIC_RETURN +; GCN: s_wait_loadcnt 0x0 +; GCN: global_store_b32 v[1:2], v0, off +; GCN: s_endpgm + %ret = call float @llvm.amdgcn.global.atomic.fmin.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data) + store float %ret, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @global_atomic_fmax_num_f32_rtn_saddr(ptr addrspace(1) inreg %ptr, float %data, ptr addrspace(1) %out) { +; GCN-LABEL: global_atomic_fmax_num_f32_rtn_saddr: +; GCN: ; %bb.0: +; GCN: v_mov_b32_e32 v3, 0 +; GCN: global_atomic_max_num_f32 v0, v3, v0, s[0:1] th:TH_ATOMIC_RETURN +; GCN: s_wait_loadcnt 0x0 +; GCN: global_store_b32 v[1:2], v0, off +; GCN: s_endpgm + %ret = call float @llvm.amdgcn.global.atomic.fmax.num.f32.p1.f32(ptr addrspace(1) %ptr, float %data) + store float %ret, ptr addrspace(1) %out + ret void +}