michaelselehov cb3fbe921b
[AMDGPU] Set preferred function alignment based on icache geometry (#183064)
Non-entry functions were unconditionally aligned to 4 bytes with no
architecture-specific preferred alignment, and setAlignment() was used
instead of ensureAlignment(), overwriting any explicit IR attributes.

Add instruction cache line size and fetch alignment data to GCNSubtarget
for each generation (GFX9: 64B/32B, GFX10: 64B/4B, GFX11+: 128B/4B). Use
this to call setPrefFunctionAlignment() in SITargetLowering, aligning
non-entry functions to the cache line size by default. Change
setAlignment to ensureAlignment in AMDGPUAsmPrinter so explicit IR align
attributes are respected.

Empirical thread trace analysis on gfx942, gfx1030, gfx1100, and gfx1200
showed that only GFX9 exhibits measurable fetch stalls when functions
cross the 32-byte fetch window boundary. GFX10+ showed no alignment
sensitivity. A hidden option -amdgpu-align-functions-for-fetch-only is
provided to use the fetch granularity instead of cache line size.

Assisted-by: Claude Opus
2026-03-11 07:57:37 -04:00

73 lines
2.4 KiB
LLVM

; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA-CI %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA-VI %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -filetype=obj | llvm-readobj --symbols -S --sd - | FileCheck --check-prefix=ELF %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | llvm-mc -filetype=obj -triple amdgcn--amdhsa --amdhsa-code-object-version=4 -mcpu=kaveri | llvm-readobj --symbols -S --sd - | FileCheck %s --check-prefix=ELF
; The SHT_NOTE section contains the output from the .hsa_code_object_*
; directives.
; ELF: Section {
; ELF: Name: .text
; ELF: Type: SHT_PROGBITS (0x1)
; ELF: Flags [ (0x6)
; ELF: SHF_ALLOC (0x2)
; ELF: SHF_EXECINSTR (0x4)
; ELF: AddressAlignment: 64
; ELF: }
; ELF: SHT_NOTE
; ELF: 0000: 07000000 4F000000 20000000 414D4447
; ELF: 0010: 50550000 83AE616D 64687361 2E6B6572
; ELF: 0020: 6E656C73 90AD616D 64687361 2E746172
; ELF: 0030: 676574BD 616D6467 636E2D75 6E6B6E6F
; ELF: 0040: 776E2D61 6D646873 612D2D67 66783730
; ELF: 0050: 30AE616D 64687361 2E766572 73696F6E
; ELF: 0060: 92010100
; ELF: Symbol {
; ELF: Name: simple
; ELF: Size: 36
; ELF: Type: Function (0x2)
; ELF: }
; HSA: .text
; HSA-CI: .amdgcn_target "amdgcn-unknown-amdhsa--gfx700"
; HSA-VI: .amdgcn_target "amdgcn-unknown-amdhsa--gfx801"
; HSA-NOT: .amdgpu_hsa_kernel simple
; HSA: .globl simple
; HSA: .p2align 6
; HSA: {{^}}simple:
; HSA-NOT: amd_kernel_code_t
; HSA: flat_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v[0:1]
; Make sure we are setting the ATC bit:
; Make sure we generate flat store for HSA
; HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
; HSA: .Lfunc_end0:
; HSA: .size simple, .Lfunc_end0-simple
; HSA: ; Function info:
; HSA-NOT: COMPUTE_PGM_RSRC2
define void @simple(ptr addrspace(4) %ptr.out) {
entry:
%out = load ptr addrspace(1), ptr addrspace(4) %ptr.out
store i32 0, ptr addrspace(1) %out
ret void
}
; Ignore explicit alignment that is too low.
; HSA: .globl simple_align2
; HSA: .p2align 6
define void @simple_align2(ptr addrspace(4) %ptr.out) align 2 {
entry:
%out = load ptr addrspace(1), ptr addrspace(4) %ptr.out
store i32 0, ptr addrspace(1) %out
ret void
}
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 400}