[AMDGPU] Pre-GFX10 does not need added latency for workgroup fences (#177157)
Wait counts will not typically be introduced for workgroup scope fences in pre-GFX10 ASICs. Hence avoid adding scheduling latency for these.
This commit is contained in:
parent
667703ed1b
commit
61f272d5cc
@ -21,6 +21,7 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "AMDGPUBarrierLatency.h"
|
||||
#include "GCNSubtarget.h"
|
||||
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
|
||||
#include "SIInstrInfo.h"
|
||||
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
|
||||
@ -47,6 +48,12 @@ public:
|
||||
IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront"));
|
||||
IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront-one-as"));
|
||||
IgnoredScopes.insert(Context.getOrInsertSyncScopeID("singlethread-one-as"));
|
||||
|
||||
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
|
||||
if (!ST.requiresWaitOnWorkgroupReleaseFence()) {
|
||||
// Prior to GFX10 (and without tgsplit), workgroup-scope fences do not
// normally require waitcnts, so do not add latency for them.
|
||||
IgnoredScopes.insert(Context.getOrInsertSyncScopeID("workgroup"));
|
||||
}
|
||||
}
|
||||
void apply(ScheduleDAGInstrs *DAG) override;
|
||||
};
|
||||
|
||||
@ -1008,6 +1008,10 @@ public:
|
||||
bool useRealTrue16Insts() const {
|
||||
return hasTrue16BitInsts() && EnableRealTrue16Insts;
|
||||
}
|
||||
|
||||
bool requiresWaitOnWorkgroupReleaseFence() const {
|
||||
return getGeneration() >= GFX10 || isTgSplitEnabled();
|
||||
}
|
||||
};
|
||||
|
||||
class GCNUserSGPRUsageInfo {
|
||||
|
||||
190
llvm/test/CodeGen/AMDGPU/schedule-barrier-latency-gfx9.mir
Normal file
190
llvm/test/CodeGen/AMDGPU/schedule-barrier-latency-gfx9.mir
Normal file
@ -0,0 +1,190 @@
|
||||
# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
|
||||
# RUN: llc -mtriple=amdgcn -mcpu=gfx950 -start-before=machine-scheduler -o - %s | FileCheck -check-prefix=GFX9 %s
|
||||
# RUN: llc -mtriple=amdgcn -mcpu=gfx950 -mattr=+tgsplit -start-before=machine-scheduler -o - %s | FileCheck -check-prefix=GFX9-TGS %s
|
||||
|
||||
# Check workgroup fences on GFX9 do not add scheduling latency.
|
||||
# s_barrier should occur before s_waitcnts to hide load latency.
|
||||
# Latency should still be added when tg-split is enabled.
|
||||
# The added latency allows the pre-barrier atomic fence to be merged with the waits on loads.
|
||||
|
||||
# LLVM IR to help syncscope IDs match MIR
|
||||
# SSID 2 = workgroup
|
||||
# SSID 3 = wavefront
|
||||
--- |
|
||||
define amdgpu_kernel void @test_workgroup() {
|
||||
; GFX9-LABEL: test_workgroup:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-NEXT: global_load_ushort v14, v[0:1], off
|
||||
; GFX9-NEXT: ; implicit-def: $vgpr4_vgpr5
|
||||
; GFX9-NEXT: global_load_ushort v15, v[4:5], off
|
||||
; GFX9-NEXT: ; implicit-def: $vgpr2
|
||||
; GFX9-NEXT: v_add_u32_e32 v0, 1, v2
|
||||
; GFX9-NEXT: ; implicit-def: $vgpr7
|
||||
; GFX9-NEXT: v_mul_hi_u32 v1, v0, v7
|
||||
; GFX9-NEXT: v_sub_u32_e32 v2, v0, v1
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v2
|
||||
; GFX9-NEXT: v_add_u32_e32 v1, v2, v1
|
||||
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 14, v1
|
||||
; GFX9-NEXT: v_mul_u32_u24_e32 v1, 0x14c5d7, v1
|
||||
; GFX9-NEXT: ; implicit-def: $vgpr6
|
||||
; GFX9-NEXT: v_add3_u32 v2, v6, v1, v0
|
||||
; GFX9-NEXT: v_lshlrev_b64 v[4:5], 1, v[2:3]
|
||||
; GFX9-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
|
||||
; GFX9-NEXT: v_lshl_add_u64 v[2:3], s[0:1], 0, v[4:5]
|
||||
; GFX9-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[4:5]
|
||||
; GFX9-NEXT: global_load_ushort v17, v[2:3], off
|
||||
; GFX9-NEXT: global_load_ushort v18, v[0:1], off
|
||||
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2
|
||||
; GFX9-NEXT: ; implicit-def: $vgpr8_vgpr9
|
||||
; GFX9-NEXT: ; implicit-def: $vgpr10_vgpr11
|
||||
; GFX9-NEXT: ; implicit-def: $sgpr0_sgpr1
|
||||
; GFX9-NEXT: ; implicit-def: $vgpr12_vgpr13
|
||||
; GFX9-NEXT: ; implicit-def: $vgpr6_vgpr7
|
||||
; GFX9-NEXT: ; implicit-def: $vgpr2_vgpr3
|
||||
; GFX9-NEXT: v_lshl_add_u64 v[4:5], s[4:5], 0, v[4:5]
|
||||
; GFX9-NEXT: v_lshl_add_u64 v[2:3], s[4:5], 0, v[2:3]
|
||||
; GFX9-NEXT: s_barrier
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(3)
|
||||
; GFX9-NEXT: v_cvt_f32_f16_e32 v14, v14
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX9-NEXT: v_cvt_f32_f16_e32 v16, v15
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: v_cvt_f32_f16_e32 v17, v17
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: v_cvt_f32_f16_e32 v15, v18
|
||||
; GFX9-NEXT: v_pk_add_f32 v[0:1], v[16:17], v[0:1] neg_lo:[0,1] neg_hi:[0,1]
|
||||
; GFX9-NEXT: s_nop 0
|
||||
; GFX9-NEXT: v_pk_mul_f32 v[0:1], v[8:9], v[0:1]
|
||||
; GFX9-NEXT: v_pk_fma_f32 v[10:11], v[14:15], s[0:1], v[10:11] op_sel_hi:[1,0,1]
|
||||
; GFX9-NEXT: v_pk_mul_f32 v[0:1], v[12:13], v[0:1]
|
||||
; GFX9-NEXT: s_nop 0
|
||||
; GFX9-NEXT: v_pk_add_f32 v[0:1], v[10:11], v[0:1]
|
||||
; GFX9-NEXT: s_nop 0
|
||||
; GFX9-NEXT: v_pk_mul_f32 v[0:1], v[6:7], v[0:1]
|
||||
; GFX9-NEXT: s_nop 0
|
||||
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
|
||||
; GFX9-NEXT: global_store_short v[2:3], v0, off
|
||||
; GFX9-NEXT: global_store_short v[4:5], v1, off
|
||||
;
|
||||
; GFX9-TGS-LABEL: test_workgroup:
|
||||
; GFX9-TGS: ; %bb.0:
|
||||
; GFX9-TGS-NEXT: ; implicit-def: $vgpr0_vgpr1
|
||||
; GFX9-TGS-NEXT: ; implicit-def: $vgpr2_vgpr3
|
||||
; GFX9-TGS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX9-TGS-NEXT: global_load_ushort v14, v[0:1], off
|
||||
; GFX9-TGS-NEXT: ; implicit-def: $vgpr4_vgpr5
|
||||
; GFX9-TGS-NEXT: global_load_ushort v15, v[4:5], off
|
||||
; GFX9-TGS-NEXT: ; implicit-def: $vgpr2
|
||||
; GFX9-TGS-NEXT: v_add_u32_e32 v0, 1, v2
|
||||
; GFX9-TGS-NEXT: ; implicit-def: $vgpr7
|
||||
; GFX9-TGS-NEXT: v_mul_hi_u32 v1, v0, v7
|
||||
; GFX9-TGS-NEXT: v_sub_u32_e32 v2, v0, v1
|
||||
; GFX9-TGS-NEXT: v_lshrrev_b32_e32 v2, 1, v2
|
||||
; GFX9-TGS-NEXT: v_add_u32_e32 v1, v2, v1
|
||||
; GFX9-TGS-NEXT: v_lshrrev_b32_e32 v1, 14, v1
|
||||
; GFX9-TGS-NEXT: v_mul_u32_u24_e32 v1, 0x14c5d7, v1
|
||||
; GFX9-TGS-NEXT: ; implicit-def: $vgpr6
|
||||
; GFX9-TGS-NEXT: v_add3_u32 v2, v6, v1, v0
|
||||
; GFX9-TGS-NEXT: v_lshlrev_b64 v[4:5], 1, v[2:3]
|
||||
; GFX9-TGS-NEXT: ; implicit-def: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
|
||||
; GFX9-TGS-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[4:5]
|
||||
; GFX9-TGS-NEXT: global_load_ushort v18, v[0:1], off
|
||||
; GFX9-TGS-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[4:5]
|
||||
; GFX9-TGS-NEXT: global_load_ushort v17, v[0:1], off
|
||||
; GFX9-TGS-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2
|
||||
; GFX9-TGS-NEXT: ; implicit-def: $vgpr8_vgpr9
|
||||
; GFX9-TGS-NEXT: ; implicit-def: $vgpr10_vgpr11
|
||||
; GFX9-TGS-NEXT: ; implicit-def: $sgpr0_sgpr1
|
||||
; GFX9-TGS-NEXT: ; implicit-def: $vgpr12_vgpr13
|
||||
; GFX9-TGS-NEXT: ; implicit-def: $vgpr6_vgpr7
|
||||
; GFX9-TGS-NEXT: ; implicit-def: $vgpr2_vgpr3
|
||||
; GFX9-TGS-NEXT: v_lshl_add_u64 v[4:5], s[4:5], 0, v[4:5]
|
||||
; GFX9-TGS-NEXT: v_lshl_add_u64 v[2:3], s[4:5], 0, v[2:3]
|
||||
; GFX9-TGS-NEXT: s_waitcnt vmcnt(3)
|
||||
; GFX9-TGS-NEXT: v_cvt_f32_f16_e32 v14, v14
|
||||
; GFX9-TGS-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX9-TGS-NEXT: v_cvt_f32_f16_e32 v16, v15
|
||||
; GFX9-TGS-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-TGS-NEXT: v_cvt_f32_f16_e32 v15, v18
|
||||
; GFX9-TGS-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-TGS-NEXT: v_cvt_f32_f16_e32 v17, v17
|
||||
; GFX9-TGS-NEXT: v_pk_fma_f32 v[10:11], v[14:15], s[0:1], v[10:11] op_sel_hi:[1,0,1]
|
||||
; GFX9-TGS-NEXT: v_pk_add_f32 v[0:1], v[16:17], v[0:1] neg_lo:[0,1] neg_hi:[0,1]
|
||||
; GFX9-TGS-NEXT: s_nop 0
|
||||
; GFX9-TGS-NEXT: v_pk_mul_f32 v[0:1], v[8:9], v[0:1]
|
||||
; GFX9-TGS-NEXT: s_nop 0
|
||||
; GFX9-TGS-NEXT: v_pk_mul_f32 v[0:1], v[12:13], v[0:1]
|
||||
; GFX9-TGS-NEXT: s_barrier
|
||||
; GFX9-TGS-NEXT: v_pk_add_f32 v[0:1], v[10:11], v[0:1]
|
||||
; GFX9-TGS-NEXT: buffer_inv sc0
|
||||
; GFX9-TGS-NEXT: v_pk_mul_f32 v[0:1], v[6:7], v[0:1]
|
||||
; GFX9-TGS-NEXT: s_nop 0
|
||||
; GFX9-TGS-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GFX9-TGS-NEXT: v_cvt_f16_f32_e32 v1, v1
|
||||
; GFX9-TGS-NEXT: global_store_short v[2:3], v0, off
|
||||
; GFX9-TGS-NEXT: global_store_short v[4:5], v1, off
|
||||
fence syncscope("workgroup") acq_rel
|
||||
fence syncscope("wavefront") acq_rel
|
||||
ret void
|
||||
}
|
||||
...
|
||||
|
||||
---
|
||||
name: test_workgroup
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
%0:sgpr_256 = IMPLICIT_DEF
|
||||
%1:vgpr_32 = IMPLICIT_DEF
|
||||
%2:vreg_96_align2 = IMPLICIT_DEF
|
||||
%3:vgpr_32 = IMPLICIT_DEF
|
||||
%4:vreg_64_align2 = IMPLICIT_DEF
|
||||
%5:vreg_64_align2 = IMPLICIT_DEF
|
||||
%6:vreg_64_align2 = IMPLICIT_DEF
|
||||
%7:sgpr_64 = IMPLICIT_DEF
|
||||
%8:vreg_64_align2 = IMPLICIT_DEF
|
||||
%9:vreg_64_align2 = IMPLICIT_DEF
|
||||
%10:vreg_64_align2 = IMPLICIT_DEF
|
||||
%11:vreg_64_align2 = IMPLICIT_DEF
|
||||
%12:vgpr_32 = IMPLICIT_DEF
|
||||
%13:vgpr_32 = GLOBAL_LOAD_USHORT %5:vreg_64_align2, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s16), addrspace 1)
|
||||
%14:vreg_64_align2 = IMPLICIT_DEF
|
||||
%15:vgpr_32 = GLOBAL_LOAD_USHORT %14:vreg_64_align2, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s16), addrspace 1)
|
||||
%16:vgpr_32 = V_ADD_U32_e32 1, %12:vgpr_32, implicit $exec
|
||||
%17:vgpr_32 = V_MUL_HI_U32_e64 %16:vgpr_32, %3:vgpr_32, implicit $exec
|
||||
%18:vgpr_32 = V_SUB_U32_e32 %16:vgpr_32, %17:vgpr_32, implicit $exec
|
||||
%19:vgpr_32 = V_LSHRREV_B32_e32 1, %18:vgpr_32, implicit $exec
|
||||
%20:vgpr_32 = V_ADD_U32_e32 %19:vgpr_32, %17:vgpr_32, implicit $exec
|
||||
%21:vgpr_32 = V_LSHRREV_B32_e32 14, %20:vgpr_32, implicit $exec
|
||||
%22:vgpr_32 = V_MUL_U32_U24_e32 1361367, %21:vgpr_32, implicit $exec
|
||||
%6.sub0:vreg_64_align2 = V_ADD3_U32_e64 %1:vgpr_32, %22:vgpr_32, %16:vgpr_32, implicit $exec
|
||||
%23:vreg_64_align2 = nuw nsw V_LSHLREV_B64_e64 1, %6:vreg_64_align2, implicit $exec
|
||||
%24:vreg_64_align2 = V_LSHL_ADD_U64_e64 %0.sub2_sub3:sgpr_256, 0, %23:vreg_64_align2, implicit $exec
|
||||
%25:vgpr_32 = GLOBAL_LOAD_USHORT %24:vreg_64_align2, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s16), addrspace 1)
|
||||
%26:vreg_64_align2 = V_LSHL_ADD_U64_e64 %0.sub0_sub1:sgpr_256, 0, %23:vreg_64_align2, implicit $exec
|
||||
%27:vgpr_32 = GLOBAL_LOAD_USHORT %26:vreg_64_align2, 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s16), addrspace 1)
|
||||
undef %28.sub0:vreg_64_align2 = nofpexcept V_CVT_F32_F16_e32 %13:vgpr_32, implicit $mode, implicit $exec
|
||||
%28.sub1:vreg_64_align2 = nofpexcept V_CVT_F32_F16_e32 %25:vgpr_32, implicit $mode, implicit $exec
|
||||
undef %29.sub0:vreg_64_align2 = nofpexcept V_CVT_F32_F16_e32 %15:vgpr_32, implicit $mode, implicit $exec
|
||||
%29.sub1:vreg_64_align2 = nofpexcept V_CVT_F32_F16_e32 %27:vgpr_32, implicit $mode, implicit $exec
|
||||
%30:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, %29:vreg_64_align2, 11, %2.sub0_sub1:vreg_96_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
|
||||
%31:vreg_64_align2 = nofpexcept V_PK_FMA_F32 8, %28:vreg_64_align2, 0, %7:sgpr_64, 8, %10:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
|
||||
%32:vreg_64_align2 = nofpexcept V_PK_MUL_F32 8, %9:vreg_64_align2, 8, %30:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
|
||||
%33:vreg_64_align2 = nofpexcept V_PK_MUL_F32 8, %11:vreg_64_align2, 8, %32:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
|
||||
%34:vreg_64_align2 = nofpexcept V_PK_ADD_F32 8, %31:vreg_64_align2, 8, %33:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
|
||||
%35:vreg_64_align2 = nofpexcept V_PK_MUL_F32 8, %8:vreg_64_align2, 8, %34:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
|
||||
ATOMIC_FENCE 5, 2
|
||||
S_BARRIER
|
||||
ATOMIC_FENCE 4, 2
|
||||
%36:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %35.sub0:vreg_64_align2, implicit $mode, implicit $exec
|
||||
%37:vreg_64_align2 = V_LSHL_ADD_U64_e64 %0.sub4_sub5:sgpr_256, 0, %4:vreg_64_align2, implicit $exec
|
||||
GLOBAL_STORE_SHORT %37:vreg_64_align2, %36:vgpr_32, 0, 0, implicit $exec :: (store (s16), addrspace 1)
|
||||
%38:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 %35.sub1:vreg_64_align2, implicit $mode, implicit $exec
|
||||
%39:vreg_64_align2 = V_LSHL_ADD_U64_e64 %0.sub4_sub5:sgpr_256, 0, %23:vreg_64_align2, implicit $exec
|
||||
GLOBAL_STORE_SHORT %39:vreg_64_align2, %38:vgpr_32, 0, 0, implicit $exec :: (store (s16), addrspace 1)
|
||||
|
||||
...
|
||||
Loading…
x
Reference in New Issue
Block a user