From 61f272d5cc75ccecfdcd83b3e3002b30ccce9918 Mon Sep 17 00:00:00 2001 From: Carl Ritson Date: Tue, 27 Jan 2026 10:24:05 +0900 Subject: [PATCH] [AMDGPU] Pre-GFX10 does not need added latency for workgroup fences (#177157) Wait counts will not typically be introduced for workgroup scope fences in pre-GFX10 ASICs. Hence avoid adding scheduling latency for these. --- .../Target/AMDGPU/AMDGPUBarrierLatency.cpp | 7 + llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 + .../AMDGPU/schedule-barrier-latency-gfx9.mir | 190 ++++++++++++++++++ 3 files changed, 201 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/schedule-barrier-latency-gfx9.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp index 223e7650f749..c9fcec8a4bbd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUBarrierLatency.cpp @@ -21,6 +21,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPUBarrierLatency.h" +#include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIInstrInfo.h" #include "llvm/CodeGen/ScheduleDAGInstrs.h" @@ -47,6 +48,12 @@ public: IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront")); IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront-one-as")); IgnoredScopes.insert(Context.getOrInsertSyncScopeID("singlethread-one-as")); + + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + if (!ST.requiresWaitOnWorkgroupReleaseFence()) { + // Prior to GFX10 workgroup scope does not normally require waitcnts + IgnoredScopes.insert(Context.getOrInsertSyncScopeID("workgroup")); + } } void apply(ScheduleDAGInstrs *DAG) override; }; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 8e473cb57327..d27a7384a7da 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1008,6 +1008,10 @@ public: bool useRealTrue16Insts() const { 
return hasTrue16BitInsts() && EnableRealTrue16Insts; } + + bool requiresWaitOnWorkgroupReleaseFence() const { + return getGeneration() >= GFX10 || isTgSplitEnabled(); + } }; class GCNUserSGPRUsageInfo { diff --git a/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency-gfx9.mir b/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency-gfx9.mir new file mode 100644 index 000000000000..7be5b164cd1a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/schedule-barrier-latency-gfx9.mir @@ -0,0 +1,190 @@ +# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn -mcpu=gfx950 -start-before=machine-scheduler -o - %s | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx950 -mattr=+tgsplit -start-before=machine-scheduler -o - %s | FileCheck -check-prefix=GFX9-TGS %s + +# Check workgroup fences on GFX9 do not add scheduling latency. +# s_barrier should occur before s_waitcnts to hide load latency. +# Latency should still be added when tg-split is enabled. +# This allows merging of pre-barrier atomic fence with waits on loads. 
+ +# LLVM IR to help syncscope IDs match MIR +# SSID 2 = workgroup +# SSID 3 = wavefront +--- | + define amdgpu_kernel void @test_workgroup() { + ; GFX9-LABEL: test_workgroup: + ; GFX9: ; %bb.0: + ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 + ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 + ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + ; GFX9-NEXT: global_load_ushort v14, v[0:1], off + ; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5 + ; GFX9-NEXT: global_load_ushort v15, v[4:5], off + ; GFX9-NEXT: ; implicit-def: $vgpr2 + ; GFX9-NEXT: v_add_u32_e32 v0, 1, v2 + ; GFX9-NEXT: ; implicit-def: $vgpr7 + ; GFX9-NEXT: v_mul_hi_u32 v1, v0, v7 + ; GFX9-NEXT: v_sub_u32_e32 v2, v0, v1 + ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v2 + ; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 + ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 14, v1 + ; GFX9-NEXT: v_mul_u32_u24_e32 v1, 0x14c5d7, v1 + ; GFX9-NEXT: ; implicit-def: $vgpr6 + ; GFX9-NEXT: v_add3_u32 v2, v6, v1, v0 + ; GFX9-NEXT: v_lshlrev_b64 v[4:5], 1, v[2:3] + ; GFX9-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX9-NEXT: v_lshl_add_u64 v[2:3], s[0:1], 0, v[4:5] + ; GFX9-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[4:5] + ; GFX9-NEXT: global_load_ushort v17, v[2:3], off + ; GFX9-NEXT: global_load_ushort v18, v[0:1], off + ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 + ; GFX9-NEXT: ; implicit-def: $vgpr8_vgpr9 + ; GFX9-NEXT: ; implicit-def: $vgpr10_vgpr11 + ; GFX9-NEXT: ; implicit-def: $sgpr0_sgpr1 + ; GFX9-NEXT: ; implicit-def: $vgpr12_vgpr13 + ; GFX9-NEXT: ; implicit-def: $vgpr6_vgpr7 + ; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3 + ; GFX9-NEXT: v_lshl_add_u64 v[4:5], s[4:5], 0, v[4:5] + ; GFX9-NEXT: v_lshl_add_u64 v[2:3], s[4:5], 0, v[2:3] + ; GFX9-NEXT: s_barrier + ; GFX9-NEXT: s_waitcnt vmcnt(3) + ; GFX9-NEXT: v_cvt_f32_f16_e32 v14, v14 + ; GFX9-NEXT: s_waitcnt vmcnt(2) + ; GFX9-NEXT: v_cvt_f32_f16_e32 v16, v15 + ; GFX9-NEXT: s_waitcnt vmcnt(1) + ; GFX9-NEXT: v_cvt_f32_f16_e32 v17, v17 + ; GFX9-NEXT: s_waitcnt vmcnt(0) + ; 
GFX9-NEXT: v_cvt_f32_f16_e32 v15, v18 + ; GFX9-NEXT: v_pk_add_f32 v[0:1], v[16:17], v[0:1] neg_lo:[0,1] neg_hi:[0,1] + ; GFX9-NEXT: s_nop 0 + ; GFX9-NEXT: v_pk_mul_f32 v[0:1], v[8:9], v[0:1] + ; GFX9-NEXT: v_pk_fma_f32 v[10:11], v[14:15], s[0:1], v[10:11] op_sel_hi:[1,0,1] + ; GFX9-NEXT: v_pk_mul_f32 v[0:1], v[12:13], v[0:1] + ; GFX9-NEXT: s_nop 0 + ; GFX9-NEXT: v_pk_add_f32 v[0:1], v[10:11], v[0:1] + ; GFX9-NEXT: s_nop 0 + ; GFX9-NEXT: v_pk_mul_f32 v[0:1], v[6:7], v[0:1] + ; GFX9-NEXT: s_nop 0 + ; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 + ; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 + ; GFX9-NEXT: global_store_short v[2:3], v0, off + ; GFX9-NEXT: global_store_short v[4:5], v1, off + ; + ; GFX9-TGS-LABEL: test_workgroup: + ; GFX9-TGS: ; %bb.0: + ; GFX9-TGS-NEXT: ; implicit-def: $vgpr0_vgpr1 + ; GFX9-TGS-NEXT: ; implicit-def: $vgpr2_vgpr3 + ; GFX9-TGS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + ; GFX9-TGS-NEXT: global_load_ushort v14, v[0:1], off + ; GFX9-TGS-NEXT: ; implicit-def: $vgpr4_vgpr5 + ; GFX9-TGS-NEXT: global_load_ushort v15, v[4:5], off + ; GFX9-TGS-NEXT: ; implicit-def: $vgpr2 + ; GFX9-TGS-NEXT: v_add_u32_e32 v0, 1, v2 + ; GFX9-TGS-NEXT: ; implicit-def: $vgpr7 + ; GFX9-TGS-NEXT: v_mul_hi_u32 v1, v0, v7 + ; GFX9-TGS-NEXT: v_sub_u32_e32 v2, v0, v1 + ; GFX9-TGS-NEXT: v_lshrrev_b32_e32 v2, 1, v2 + ; GFX9-TGS-NEXT: v_add_u32_e32 v1, v2, v1 + ; GFX9-TGS-NEXT: v_lshrrev_b32_e32 v1, 14, v1 + ; GFX9-TGS-NEXT: v_mul_u32_u24_e32 v1, 0x14c5d7, v1 + ; GFX9-TGS-NEXT: ; implicit-def: $vgpr6 + ; GFX9-TGS-NEXT: v_add3_u32 v2, v6, v1, v0 + ; GFX9-TGS-NEXT: v_lshlrev_b64 v[4:5], 1, v[2:3] + ; GFX9-TGS-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX9-TGS-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[4:5] + ; GFX9-TGS-NEXT: global_load_ushort v18, v[0:1], off + ; GFX9-TGS-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[4:5] + ; GFX9-TGS-NEXT: global_load_ushort v17, v[0:1], off + ; GFX9-TGS-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 + ; GFX9-TGS-NEXT: ; 
implicit-def: $vgpr8_vgpr9 + ; GFX9-TGS-NEXT: ; implicit-def: $vgpr10_vgpr11 + ; GFX9-TGS-NEXT: ; implicit-def: $sgpr0_sgpr1 + ; GFX9-TGS-NEXT: ; implicit-def: $vgpr12_vgpr13 + ; GFX9-TGS-NEXT: ; implicit-def: $vgpr6_vgpr7 + ; GFX9-TGS-NEXT: ; implicit-def: $vgpr2_vgpr3 + ; GFX9-TGS-NEXT: v_lshl_add_u64 v[4:5], s[4:5], 0, v[4:5] + ; GFX9-TGS-NEXT: v_lshl_add_u64 v[2:3], s[4:5], 0, v[2:3] + ; GFX9-TGS-NEXT: s_waitcnt vmcnt(3) + ; GFX9-TGS-NEXT: v_cvt_f32_f16_e32 v14, v14 + ; GFX9-TGS-NEXT: s_waitcnt vmcnt(2) + ; GFX9-TGS-NEXT: v_cvt_f32_f16_e32 v16, v15 + ; GFX9-TGS-NEXT: s_waitcnt vmcnt(1) + ; GFX9-TGS-NEXT: v_cvt_f32_f16_e32 v15, v18 + ; GFX9-TGS-NEXT: s_waitcnt vmcnt(0) + ; GFX9-TGS-NEXT: v_cvt_f32_f16_e32 v17, v17 + ; GFX9-TGS-NEXT: v_pk_fma_f32 v[10:11], v[14:15], s[0:1], v[10:11] op_sel_hi:[1,0,1] + ; GFX9-TGS-NEXT: v_pk_add_f32 v[0:1], v[16:17], v[0:1] neg_lo:[0,1] neg_hi:[0,1] + ; GFX9-TGS-NEXT: s_nop 0 + ; GFX9-TGS-NEXT: v_pk_mul_f32 v[0:1], v[8:9], v[0:1] + ; GFX9-TGS-NEXT: s_nop 0 + ; GFX9-TGS-NEXT: v_pk_mul_f32 v[0:1], v[12:13], v[0:1] + ; GFX9-TGS-NEXT: s_barrier + ; GFX9-TGS-NEXT: v_pk_add_f32 v[0:1], v[10:11], v[0:1] + ; GFX9-TGS-NEXT: buffer_inv sc0 + ; GFX9-TGS-NEXT: v_pk_mul_f32 v[0:1], v[6:7], v[0:1] + ; GFX9-TGS-NEXT: s_nop 0 + ; GFX9-TGS-NEXT: v_cvt_f16_f32_e32 v0, v0 + ; GFX9-TGS-NEXT: v_cvt_f16_f32_e32 v1, v1 + ; GFX9-TGS-NEXT: global_store_short v[2:3], v0, off + ; GFX9-TGS-NEXT: global_store_short v[4:5], v1, off + fence syncscope("workgroup") acq_rel + fence syncscope("wavefront") acq_rel + ret void + } +... 
+ +--- +name: test_workgroup +tracksRegLiveness: true +body: | + bb.0: + %0:sgpr_256 = IMPLICIT_DEF + %1:vgpr_32 = IMPLICIT_DEF + %2:vreg_96_align2 = IMPLICIT_DEF + %3:vgpr_32 = IMPLICIT_DEF + %4:vreg_64_align2 = IMPLICIT_DEF + %5:vreg_64_align2 = IMPLICIT_DEF + %6:vreg_64_align2 = IMPLICIT_DEF + %7:sgpr_64 = IMPLICIT_DEF + %8:vreg_64_align2 = IMPLICIT_DEF + %9:vreg_64_align2 = IMPLICIT_DEF + %10:vreg_64_align2 = IMPLICIT_DEF + %11:vreg_64_align2 = IMPLICIT_DEF + %12:vgpr_32 = IMPLICIT_DEF + %13:vgpr_32 = GLOBAL_LOAD_USHORT %5:vreg_64_align2, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s16), addrspace 1) + %14:vreg_64_align2 = IMPLICIT_DEF + %15:vgpr_32 = GLOBAL_LOAD_USHORT %14:vreg_64_align2, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s16), addrspace 1) + %16:vgpr_32 = V_ADD_U32_e32 1, %12:vgpr_32, implicit $exec + %17:vgpr_32 = V_MUL_HI_U32_e64 %16:vgpr_32, %3:vgpr_32, implicit $exec + %18:vgpr_32 = V_SUB_U32_e32 %16:vgpr_32, %17:vgpr_32, implicit $exec + %19:vgpr_32 = V_LSHRREV_B32_e32 1, %18:vgpr_32, implicit $exec + %20:vgpr_32 = V_ADD_U32_e32 %19:vgpr_32, %17:vgpr_32, implicit $exec + %21:vgpr_32 = V_LSHRREV_B32_e32 14, %20:vgpr_32, implicit $exec + %22:vgpr_32 = V_MUL_U32_U24_e32 1361367, %21:vgpr_32, implicit $exec + %6.sub0:vreg_64_align2 = V_ADD3_U32_e64 %1:vgpr_32, %22:vgpr_32, %16:vgpr_32, implicit $exec + %23:vreg_64_align2 = nuw nsw V_LSHLREV_B64_e64 1, %6:vreg_64_align2, implicit $exec + %24:vreg_64_align2 = V_LSHL_ADD_U64_e64 %0.sub2_sub3:sgpr_256, 0, %23:vreg_64_align2, implicit $exec + %25:vgpr_32 = GLOBAL_LOAD_USHORT %24:vreg_64_align2, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s16), addrspace 1) + %26:vreg_64_align2 = V_LSHL_ADD_U64_e64 %0.sub0_sub1:sgpr_256, 0, %23:vreg_64_align2, implicit $exec + %27:vgpr_32 = GLOBAL_LOAD_USHORT %26:vreg_64_align2, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s16), addrspace 1) + undef %28.sub0:vreg_64_align2 = nofpexcept V_CVT_F32_F16_e32 %13:vgpr_32, implicit $mode, implicit $exec + 
%28.sub1:vreg_64_align2 = nofpexcept V_CVT_F32_F16_e32 %25:vgpr_32, implicit $mode, implicit $exec + undef %29.sub0:vreg_64_align2 = nofpexcept V_CVT_F32_F16_e32 %15:vgpr_32, implicit $mode, implicit $exec + %29.sub1:vreg_64_align2 = nofpexcept V_CVT_F32_F16_e32 %27:vgpr_32, implicit $mode, implicit $exec + %30:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, %29:vreg_64_align2, 11, %2.sub0_sub1:vreg_96_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %31:vreg_64_align2 = nofpexcept V_PK_FMA_F32 8, %28:vreg_64_align2, 0, %7:sgpr_64, 8, %10:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %32:vreg_64_align2 = nofpexcept V_PK_MUL_F32 8, %9:vreg_64_align2, 8, %30:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %33:vreg_64_align2 = nofpexcept V_PK_MUL_F32 8, %11:vreg_64_align2, 8, %32:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %34:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, %31:vreg_64_align2, 8, %33:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + %35:vreg_64_align2 = nofpexcept V_PK_MUL_F32 8, %8:vreg_64_align2, 8, %34:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ATOMIC_FENCE 5, 2 + S_BARRIER + ATOMIC_FENCE 4, 2 + %36:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %35.sub0:vreg_64_align2, implicit $mode, implicit $exec + %37:vreg_64_align2 = V_LSHL_ADD_U64_e64 %0.sub4_sub5:sgpr_256, 0, %4:vreg_64_align2, implicit $exec + GLOBAL_STORE_SHORT %37:vreg_64_align2, %36:vgpr_32, 0, 0, implicit $exec :: (store (s16), addrspace 1) + %38:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %35.sub1:vreg_64_align2, implicit $mode, implicit $exec + %39:vreg_64_align2 = V_LSHL_ADD_U64_e64 %0.sub4_sub5:sgpr_256, 0, %23:vreg_64_align2, implicit $exec + GLOBAL_STORE_SHORT %39:vreg_64_align2, %38:vgpr_32, 0, 0, implicit $exec :: (store (s16), addrspace 1) + +...