Diana Picus 20d8398825
[AMDGPU] ISel & PEI for whole wave functions (#145858)
Whole wave functions are functions that will run with a full EXEC mask.
They will not be invoked directly, but instead will be launched by way
of a new intrinsic, `llvm.amdgcn.call.whole.wave` (to be added in
a future patch). These functions are meant as an alternative to the
`llvm.amdgcn.init.whole.wave` or `llvm.amdgcn.strict.wwm` intrinsics.

Whole wave functions will set EXEC to -1 in the prologue and restore the
original value of EXEC in the epilogue. They must have a special first
argument, `i1 %active`, which will be mapped to EXEC. They may have
either the default calling convention or amdgpu_gfx. The inactive lanes
must be preserved for all registers that are used; the active lanes only
need to be preserved for the CSRs.

At the IR level, arguments to a whole wave function (other than
`%active`) contain poison in their inactive lanes. Likewise, the return
value is poison in the inactive lanes.
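
For illustration, a minimal whole wave function might look like the sketch
below, using the `amdgpu_gfx_whole_wave` calling convention exercised by the
verifier test further down. The function name and body are made up; only the
calling convention and the `i1 %active` first argument are required.

```llvm
define amdgpu_gfx_whole_wave i32 @example_whole_wave(i1 %active, i32 %x) {
  ; %active is true exactly in the lanes that were active at the call site;
  ; %x holds poison in the inactive lanes, so mask it before using it.
  %safe = select i1 %active, i32 %x, i32 0
  %res = add i32 %safe, 1
  ; The return value is likewise poison in the inactive lanes.
  ret i32 %res
}
```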

This patch contains the following work:
* Two new pseudos, SI_SETUP_WHOLE_WAVE_FUNC and SI_WHOLE_WAVE_FUNC_RETURN,
  used for managing the EXEC mask. SI_SETUP_WHOLE_WAVE_FUNC returns
  an SReg_1 representing `%active`, which needs to be passed into
  SI_WHOLE_WAVE_FUNC_RETURN.
* SelectionDAG support for generating these two new pseudos and for the
  special handling of `%active`. Since the return may be in a different
  basic block, it's difficult to add the virtual register for `%active` to
  SI_WHOLE_WAVE_FUNC_RETURN, so we initially generate an IMPLICIT_DEF,
  which is later replaced via a custom inserter.
* Expansion of the two pseudos during prolog/epilog insertion (see the
  sketch after this list). PEI also marks any used VGPRs as WWM registers,
  which are then spilled and restored with the usual logic.
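
As a rough illustration only (wave64, hypothetical register choices, and not
the exact code the backend emits), the expanded prologue/epilogue conceptually
does the following, assuming the saved EXEC mask doubles as `%active`:

```
; Prologue: save the original EXEC (the active lanes) and enable all lanes.
s_or_saveexec_b64 s[34:35], -1
; ... WWM spills of the VGPRs used by the function, then the body runs
; with a full EXEC mask ...
; Epilogue: reload the WWM VGPRs while all lanes are still enabled,
; then restore the caller's EXEC mask and return.
s_mov_b64 exec, s[34:35]
s_setpc_b64 s[30:31]
```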

Future patches will add the `llvm.amdgcn.call.whole.wave` intrinsic
and a lot of optimization work (especially to reduce spills around
function calls).

---------

Co-authored-by: Matt Arsenault <Matthew.Arsenault@amd.com>
Co-authored-by: Shilei Tian <i@tianshilei.me>
2025-07-21 10:39:09 +02:00


; RUN: not llvm-as < %s 2>&1 | FileCheck %s

target datalayout = "A5"

; CHECK: Calling convention requires void return type
; CHECK-NEXT: ptr @nonvoid_cc_amdgpu_kernel
define amdgpu_kernel i32 @nonvoid_cc_amdgpu_kernel() {
  ret i32 0
}

; CHECK: Calling convention does not support varargs or perfect forwarding!
; CHECK-NEXT: ptr @varargs_amdgpu_kernel
define amdgpu_kernel void @varargs_amdgpu_kernel(...) {
  ret void
}

; CHECK: Calling convention does not allow sret
; CHECK-NEXT: ptr @sret_cc_amdgpu_kernel_as0
define amdgpu_kernel void @sret_cc_amdgpu_kernel_as0(ptr sret(i32) %ptr) {
  ret void
}

; CHECK: Calling convention does not allow sret
; CHECK-NEXT: ptr @sret_cc_amdgpu_kernel
define amdgpu_kernel void @sret_cc_amdgpu_kernel(ptr addrspace(5) sret(i32) %ptr) {
  ret void
}

; CHECK: Calling convention does not support varargs or perfect forwarding!
; CHECK-NEXT: ptr @varargs_amdgpu_vs
define amdgpu_vs void @varargs_amdgpu_vs(...) {
  ret void
}

; CHECK: Calling convention does not support varargs or perfect forwarding!
; CHECK-NEXT: ptr @varargs_amdgpu_gs
define amdgpu_gs void @varargs_amdgpu_gs(...) {
  ret void
}

; CHECK: Calling convention does not support varargs or perfect forwarding!
; CHECK-NEXT: ptr @varargs_amdgpu_ps
define amdgpu_ps void @varargs_amdgpu_ps(...) {
  ret void
}

; CHECK: Calling convention does not support varargs or perfect forwarding!
; CHECK-NEXT: ptr @varargs_amdgpu_cs
define amdgpu_cs void @varargs_amdgpu_cs(...) {
  ret void
}

; CHECK: Calling convention requires void return type
; CHECK-NEXT: ptr @nonvoid_cc_spir_kernel
define spir_kernel i32 @nonvoid_cc_spir_kernel() {
  ret i32 0
}

; CHECK: Calling convention does not support varargs or perfect forwarding!
; CHECK-NEXT: ptr @varargs_spir_kernel
define spir_kernel void @varargs_spir_kernel(...) {
  ret void
}

; CHECK: Calling convention disallows byval
; CHECK-NEXT: ptr @byval_cc_amdgpu_kernel
define amdgpu_kernel void @byval_cc_amdgpu_kernel(ptr addrspace(5) byval(i32) %ptr) {
  ret void
}

; CHECK: Calling convention disallows byval
; CHECK-NEXT: ptr @byval_as1_cc_amdgpu_kernel
define amdgpu_kernel void @byval_as1_cc_amdgpu_kernel(ptr addrspace(1) byval(i32) %ptr) {
  ret void
}

; CHECK: Calling convention disallows byval
; CHECK-NEXT: ptr @byval_as0_cc_amdgpu_kernel
define amdgpu_kernel void @byval_as0_cc_amdgpu_kernel(ptr byval(i32) %ptr) {
  ret void
}

; CHECK: Calling convention disallows byval
; CHECK-NEXT: ptr @byval_cc_amdgpu_vs
define amdgpu_vs void @byval_cc_amdgpu_vs(ptr addrspace(5) byval(i32) %ptr) {
  ret void
}

; CHECK: Calling convention disallows byval
; CHECK-NEXT: ptr @byval_cc_amdgpu_hs
define amdgpu_hs void @byval_cc_amdgpu_hs(ptr addrspace(5) byval(i32) %ptr) {
  ret void
}

; CHECK: Calling convention disallows byval
; CHECK-NEXT: ptr @byval_cc_amdgpu_gs
define amdgpu_gs void @byval_cc_amdgpu_gs(ptr addrspace(5) byval(i32) %ptr) {
  ret void
}

; CHECK: Calling convention disallows byval
; CHECK-NEXT: ptr @byval_cc_amdgpu_ps
define amdgpu_ps void @byval_cc_amdgpu_ps(ptr addrspace(5) byval(i32) %ptr) {
  ret void
}

; CHECK: Calling convention disallows byval
; CHECK-NEXT: ptr @byval_cc_amdgpu_cs
define amdgpu_cs void @byval_cc_amdgpu_cs(ptr addrspace(5) byval(i32) %ptr) {
  ret void
}

; CHECK: Calling convention disallows preallocated
; CHECK-NEXT: ptr @preallocated_as0_cc_amdgpu_kernel
define amdgpu_kernel void @preallocated_as0_cc_amdgpu_kernel(ptr preallocated(i32) %ptr) {
  ret void
}

; CHECK: Calling convention disallows inalloca
; CHECK-NEXT: ptr @inalloca_as0_cc_amdgpu_kernel
define amdgpu_kernel void @inalloca_as0_cc_amdgpu_kernel(ptr inalloca(i32) %ptr) {
  ret void
}

; CHECK: Calling convention disallows stack byref
; CHECK-NEXT: ptr @byref_as5_cc_amdgpu_kernel
define amdgpu_kernel void @byref_as5_cc_amdgpu_kernel(ptr addrspace(5) byref(i32) %ptr) {
  ret void
}

; CHECK: Calling convention requires void return type
; CHECK-NEXT: ptr @nonvoid_cc_amdgpu_cs_chain
define amdgpu_cs_chain i32 @nonvoid_cc_amdgpu_cs_chain() {
  ret i32 0
}

; CHECK: Calling convention does not support varargs or perfect forwarding!
; CHECK-NEXT: ptr @varargs_amdgpu_cs_chain
define amdgpu_cs_chain void @varargs_amdgpu_cs_chain(...) {
  ret void
}

; CHECK: Calling convention does not allow sret
; CHECK-NEXT: ptr @sret_cc_amdgpu_cs_chain_as0
define amdgpu_cs_chain void @sret_cc_amdgpu_cs_chain_as0(ptr sret(i32) %ptr) {
  ret void
}

; CHECK: Calling convention disallows byval
; CHECK-NEXT: ptr @byval_cc_amdgpu_cs_chain
define amdgpu_cs_chain void @byval_cc_amdgpu_cs_chain(ptr addrspace(1) byval(i32) %ptr) {
  ret void
}

; CHECK: Calling convention disallows stack byref
; CHECK-NEXT: ptr @byref_cc_amdgpu_cs_chain
define amdgpu_cs_chain void @byref_cc_amdgpu_cs_chain(ptr addrspace(5) byref(i32) %ptr) {
  ret void
}

; CHECK: Calling convention disallows preallocated
; CHECK-NEXT: ptr @preallocated_cc_amdgpu_cs_chain
define amdgpu_cs_chain void @preallocated_cc_amdgpu_cs_chain(ptr preallocated(i32) %ptr) {
  ret void
}

; CHECK: Calling convention disallows inalloca
; CHECK-NEXT: ptr @inalloca_cc_amdgpu_cs_chain
define amdgpu_cs_chain void @inalloca_cc_amdgpu_cs_chain(ptr inalloca(i32) %ptr) {
  ret void
}

; CHECK: Calling convention requires void return type
; CHECK-NEXT: ptr @nonvoid_cc_amdgpu_cs_chain_preserve
define amdgpu_cs_chain_preserve i32 @nonvoid_cc_amdgpu_cs_chain_preserve() {
  ret i32 0
}

; CHECK: Calling convention does not support varargs or perfect forwarding!
; CHECK-NEXT: ptr @varargs_amdgpu_cs_chain_preserve
define amdgpu_cs_chain_preserve void @varargs_amdgpu_cs_chain_preserve(...) {
  ret void
}

; CHECK: Calling convention does not allow sret
; CHECK-NEXT: ptr @sret_cc_amdgpu_cs_chain_preserve_as0
define amdgpu_cs_chain_preserve void @sret_cc_amdgpu_cs_chain_preserve_as0(ptr sret(i32) %ptr) {
  ret void
}

; CHECK: Calling convention does not allow sret
; CHECK-NEXT: ptr @sret_cc_amdgpu_cs_chain_preserve
define amdgpu_cs_chain_preserve void @sret_cc_amdgpu_cs_chain_preserve(ptr addrspace(5) sret(i32) %ptr) {
  ret void
}

; CHECK: Calling convention disallows byval
; CHECK-NEXT: ptr @byval_cc_amdgpu_cs_chain_preserve
define amdgpu_cs_chain_preserve void @byval_cc_amdgpu_cs_chain_preserve(ptr addrspace(1) byval(i32) %ptr) {
  ret void
}

; CHECK: Calling convention disallows stack byref
; CHECK-NEXT: ptr @byref_cc_amdgpu_cs_chain_preserve
define amdgpu_cs_chain_preserve void @byref_cc_amdgpu_cs_chain_preserve(ptr addrspace(5) byref(i32) %ptr) {
  ret void
}

; CHECK: Calling convention disallows preallocated
; CHECK-NEXT: ptr @preallocated_cc_amdgpu_cs_chain_preserve
define amdgpu_cs_chain_preserve void @preallocated_cc_amdgpu_cs_chain_preserve(ptr preallocated(i32) %ptr) {
  ret void
}

; CHECK: Calling convention disallows inalloca
; CHECK-NEXT: ptr @inalloca_cc_amdgpu_cs_chain_preserve
define amdgpu_cs_chain_preserve void @inalloca_cc_amdgpu_cs_chain_preserve(ptr inalloca(i32) %ptr) {
  ret void
}

; CHECK: Calling convention requires first argument to be i1
; CHECK-NEXT: ptr @whole_wave_no_args
define amdgpu_gfx_whole_wave void @whole_wave_no_args() {
  ret void
}

; CHECK: Calling convention requires first argument to be i1
; CHECK-NEXT: ptr @whole_wave_must_have_i1_active
define amdgpu_gfx_whole_wave void @whole_wave_must_have_i1_active(i32 %x) {
  ret void
}

; CHECK: Calling convention requires first argument to not be inreg
; CHECK-NEXT: ptr @whole_wave_i1_active_inreg
define amdgpu_gfx_whole_wave void @whole_wave_i1_active_inreg(i1 inreg %active) {
  ret void
}

; CHECK: Calling convention does not support varargs
; CHECK-NEXT: ptr @whole_wave_varargs
define amdgpu_gfx_whole_wave void @whole_wave_varargs(i1 %active, i32 %x, ...) {
  ret void
}

declare amdgpu_gfx_whole_wave void @whole_wave_callee(i1 %active)

; CHECK: calling convention does not permit calls
; CHECK-NEXT: call amdgpu_gfx_whole_wave void @whole_wave_callee(i1 true)
define amdgpu_cs void @cant_call_whole_wave_func() {
  call amdgpu_gfx_whole_wave void @whole_wave_callee(i1 true)
  ret void
}