llvm-project/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
Austin Kerbow 2e5c298281
[AMDGPU] Add backward compatibility layer for kernarg preloading (#119167)
Add a prologue to the kernel entry to handle cases where code designed
for kernarg preloading is executed on hardware equipped with
incompatible firmware. If the hardware has compatible firmware, the 256 bytes
at the start of the kernel entry will be skipped. This skipping is done
automatically by hardware that supports the feature.

A pass is added which is intended to be run at the very end of the
pipeline to avoid any optimizations that would assume the prologue is a
real predecessor block to the actual code start. In reality, the function has
two possible entry points: (1) the optimized path that supports kernarg
preloading, which begins at an offset of 256 bytes, and (2) the
backward-compatible entry point, which starts at offset 0.
2025-01-10 11:39:02 -08:00

53 lines
2.1 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
;
; The three invocations below all target amdgcn-amd-amdhsa on gfx942:
;   1. llc emits textual assembly (with -asm-verbose=0); the output is matched
;      by the assembly-text check lines in this file.
;   2. llc emits an object file directly; llvm-objdump disassembles it and the
;      output is matched by the disassembly check lines.
;   3. llc emits textual assembly with -amdgpu-kernarg-preload-count=1, llvm-mc
;      assembles it, and llvm-objdump disassembles the result; it is matched by
;      the same disassembly check lines as (2), so both routes to an object
;      file must satisfy identical expectations.
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -asm-verbose=0 < %s | FileCheck -check-prefixes=ASM %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx942 --disassemble - | FileCheck -check-prefixes=OBJ %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=obj | llvm-objdump --arch=amdgcn --mcpu=gfx942 --disassemble - | FileCheck -check-prefixes=OBJ %s
; Kernel with a single pointer argument marked inreg. The body stores the
; pointer value to the address it holds.
;
; The disassembly must contain the kernel symbol followed by sixty `s_nop 0`
; instructions.
; NOTE(review): presumably these nops are the padding between the three
; prologue instructions checked below and the `.p2align 8` (256-byte) boundary;
; confirm against the instruction encoding sizes.
; OBJ: preload_ptr_kernarg_header
; OBJ-COUNT-60: s_nop 0
define amdgpu_kernel void @preload_ptr_kernarg_header(ptr inreg %arg) {
; Expected compatibility prologue: load two dwords at offset 0x0 from the base
; in s[4:5] (presumably the kernarg segment pointer; verify) into s[8:9], wait
; for the load, then branch to the aligned label that follows.
; ASM-LABEL: preload_ptr_kernarg_header:
; ASM: s_load_dwordx2 s[8:9], s[4:5], 0x0
; ASM-NEXT: s_waitcnt lgkmcnt(0)
; ASM-NEXT: s_branch .LBB0_0
; ASM-NEXT: .p2align 8
; ASM-NEXT: .LBB0_0:
; Expected kernel body after the aligned label: copy s[8:9] into v[0:1] and
; store v[0:1] to the address in v[0:1].
; ASM-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; ASM-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; ASM-NEXT: s_endpgm
store ptr %arg, ptr %arg
ret void
}
; Kernel with two inreg arguments, a pointer and an i32. The body stores the
; i32 to the address given by the pointer.
;
; The disassembly must contain the kernel symbol followed by fifty-eight
; `s_nop 0` instructions, two fewer than the single-argument kernel.
; NOTE(review): presumably the count drops because the extra scalar load in the
; prologue leaves less padding before the `.p2align 8` (256-byte) boundary;
; confirm against the instruction encoding sizes.
; OBJ: preload_i32_kernarg_header
; OBJ-COUNT-58: s_nop 0
define amdgpu_kernel void @preload_i32_kernarg_header(ptr inreg %arg, i32 inreg %arg1) {
; Expected compatibility prologue: load two dwords at offset 0x0 into s[8:9]
; and one dword at offset 0x8 into s10, both from the base in s[4:5], wait for
; the loads, then branch to the aligned label that follows.
; ASM-LABEL: preload_i32_kernarg_header:
; ASM: s_load_dwordx2 s[8:9], s[4:5], 0x0
; ASM-NEXT: s_load_dword s10, s[4:5], 0x8
; ASM-NEXT: s_waitcnt lgkmcnt(0)
; ASM-NEXT: s_branch .LBB1_0
; ASM-NEXT: .p2align 8
; ASM-NEXT: .LBB1_0:
; Expected kernel body after the aligned label: copy s[8:9] into v[0:1] and
; s10 into v2, then store the dword in v2 to the address in v[0:1].
; ASM-NEXT: v_mov_b64_e32 v[0:1], s[8:9]
; ASM-NEXT: v_mov_b32_e32 v2, s10
; ASM-NEXT: flat_store_dword v[0:1], v2
; ASM-NEXT: s_endpgm
store i32 %arg1, ptr %arg
ret void
}
; Negative test: an ordinary (non-kernel) function with a plain pointer
; argument. Both the text assembly and the disassembly must mention the
; function name, and the negative checks require that no `s_branch` appears
; afterwards.
; NOTE(review): the text-assembly negative check sits before the label check
; inside the function, so it may only cover the lines leading up to the label
; rather than the function body; verify it constrains what is intended.
; OBJ: non_kernel_function
; ASM: non_kernel_function
; OBJ-NOT: s_branch
; ASM-NOT: s_branch
define void @non_kernel_function(ptr %arg) {
; Expected code: an entry wait on all counters, the store of v[0:1] to the
; address in v[0:1], a wait for the store, and a return through s[30:31].
; None of the load/branch/alignment prologue expected for the kernels in this
; file is listed here.
; ASM-LABEL: non_kernel_function:
; ASM: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; ASM-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; ASM-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; ASM-NEXT: s_setpc_b64 s[30:31]
store ptr %arg, ptr %arg
ret void
}