On GFX9, the instruction sequencer fetches 32 bytes at a time. When an 8-byte instruction at a loop header straddles a 32-byte fetch window boundary, the sequencer must perform two fetches after a backward branch, incurring a delay. On GFX950, this causes additional performance issues. This patch requests 32-byte alignment with at most 4 bytes of padding (.p2align 5, , 4) for loop headers on GFX950 when the first real instruction is 8 bytes. At most one s_nop (4 bytes, 1 quad-cycle before the loop) is used for padding. Because instructions are 4-byte aligned, an 8-byte instruction straddles a 32-byte boundary only when it starts 4 bytes before that boundary; if more than 4 bytes of padding were needed, the instruction would not straddle a boundary anyway, so alignment is skipped. Note: the alignment decision is made during block-placement, before si-insert-waitcnts. In loops where a 4-byte S_WAITCNT is later inserted as the first instruction, the alignment becomes redundant but mostly harmless (at most one extra s_nop per affected loop). Assisted-by: Claude (Anthropic)
75 lines
2.6 KiB
YAML
75 lines
2.6 KiB
YAML
# RUN: llc -mtriple=amdgcn -mcpu=gfx950 -start-before=block-placement -o - %s | FileCheck %s
|
|
|
|
# Test that loop headers are aligned to 32 bytes on GFX950 when the first
|
|
# instruction is 8 bytes, to prevent the instruction from being split by the
|
|
# 32-byte fetch window boundary.
|
|
# The second test case verifies that 4-byte instructions do NOT trigger
|
|
# alignment (CHECK-NEXT chain would break if .p2align were inserted).
|
|
|
|
# Positive case: the loop header bb.1 begins with V_LSHRREV_B64_e64, the
# 8-byte first instruction this function is named for, so the expected output
# has ".p2align 5, , 4" immediately before the .LBB0_1 label.
---
|
|
name: loop_with_8byte_first_inst
|
|
tracksRegLiveness: true
|
|
body: |
|
|
; CHECK-LABEL: loop_with_8byte_first_inst:
|
|
; CHECK: ; %bb.0:
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: s_mov_b64 s[0:1], 0
|
|
; CHECK-NEXT: .p2align 5, , 4
|
|
; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: v_lshrrev_b64 v[0:1], 1, v[0:1]
|
|
; Preheader: zero-initializes the s[0:1] mask that the loop ORs into.
bb.0:
|
|
successors: %bb.1(0x80000000)
|
|
liveins: $vgpr0_vgpr1
|
|
|
|
renamable $sgpr0_sgpr1 = S_MOV_B64 0
|
|
|
|
; Loop header and latch in one block: the back edge to bb.1 carries weight
; 0x7c000000 and the exit edge to bb.2 carries 0x04000000.
bb.1:
|
|
successors: %bb.2(0x04000000), %bb.1(0x7c000000)
|
|
liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
|
|
|
|
renamable $vgpr0_vgpr1 = V_LSHRREV_B64_e64 1, killed renamable $vgpr0_vgpr1, implicit $exec
|
|
V_CMP_EQ_U64_e32 0, $vgpr0_vgpr1, implicit-def $vcc, implicit $exec
|
|
renamable $sgpr0_sgpr1 = S_OR_B64 killed renamable $vcc, killed renamable $sgpr0_sgpr1, implicit-def $scc
|
|
$exec = S_ANDN2_B64 $exec, renamable $sgpr0_sgpr1, implicit-def $scc
|
|
S_CBRANCH_EXECNZ %bb.1, implicit $exec
|
|
|
|
; Exit block: ORs the accumulated mask back into exec and returns.
bb.2:
|
|
liveins: $sgpr0_sgpr1
|
|
|
|
$exec = S_OR_B64 $exec, killed renamable $sgpr0_sgpr1, implicit-def $scc
|
|
S_SETPC_B64_return undef $sgpr30_sgpr31
|
|
...
|
|
|
|
# Negative case: the loop header bb.1 begins with V_ADD_U32_e32, the 4-byte
# first instruction this function is named for. The expected output places the
# .LBB1_1 label on the line directly after s_mov_b64, so any alignment
# directive emitted between them would fail the test.
---
|
|
name: loop_with_4byte_first_inst
|
|
tracksRegLiveness: true
|
|
body: |
|
|
; CHECK-LABEL: loop_with_4byte_first_inst:
|
|
; CHECK: ; %bb.0:
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: s_mov_b64 s[0:1], 0
|
|
; CHECK-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: v_add_u32_e32 v0, 1, v0
|
|
; Preheader: zero-initializes the s[0:1] mask that the loop ORs into.
bb.0:
|
|
successors: %bb.1(0x80000000)
|
|
liveins: $vgpr0
|
|
|
|
renamable $sgpr0_sgpr1 = S_MOV_B64 0
|
|
|
|
; Loop header and latch in one block: the back edge to bb.1 carries weight
; 0x7c000000 and the exit edge to bb.2 carries 0x04000000.
bb.1:
|
|
successors: %bb.2(0x04000000), %bb.1(0x7c000000)
|
|
liveins: $sgpr0_sgpr1, $vgpr0
|
|
|
|
renamable $vgpr0 = V_ADD_U32_e32 1, killed $vgpr0, implicit $exec
|
|
V_CMP_LT_U32_e32 10, $vgpr0, implicit-def $vcc, implicit $exec
|
|
renamable $sgpr0_sgpr1 = S_OR_B64 killed renamable $vcc, killed renamable $sgpr0_sgpr1, implicit-def $scc
|
|
$exec = S_ANDN2_B64 $exec, renamable $sgpr0_sgpr1, implicit-def $scc
|
|
S_CBRANCH_EXECNZ %bb.1, implicit $exec
|
|
|
|
; Exit block: ORs the accumulated mask back into exec and returns.
bb.2:
|
|
liveins: $sgpr0_sgpr1
|
|
|
|
$exec = S_OR_B64 $exec, killed renamable $sgpr0_sgpr1, implicit-def $scc
|
|
S_SETPC_B64_return undef $sgpr30_sgpr31
|
|
...
|