
This PR updates the SGPR layout to a striped caller/callee-saved design, similar to the VGPR layout. To ensure that s30-s31 (return address), s32 (stack pointer), s33 (frame pointer), and s34 (base pointer) remain callee-saved, the striped layout starts from s40, with a stripe width of 8. The last stripe is 10 wide instead of 8 to avoid ending with a 2-wide stripe. Fixes #113782.
195 lines
6.5 KiB
TableGen
195 lines
6.5 KiB
TableGen
//===---- AMDCallingConv.td - Calling Conventions for Radeon GPUs ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This describes the calling conventions for the AMD Radeon GPUs.
//
//===----------------------------------------------------------------------===//

// Inversion of CCIfInReg: applies action A only when the predicate
// "!ArgFlags.isInReg()" holds, i.e. the value is not flagged inreg.
class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {}

// Applies action A only when the value carries a sign-extension or
// zero-extension flag.
class CCIfExtend<CCAction A>
  : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>;

// Calling convention for SI
//
// Rules are tried in the order listed:
//   1. inreg values of the listed types are assigned to SGPR4-29;
//   2. non-inreg values of the listed types are assigned to VGPR0-31;
//   3. the last rule assigns a stack slot via CCAssignToStack<4, 4>.
def CC_SI_Gfx : CallingConv<[
  // 0-3 are reserved for the stack buffer descriptor
  // 30-31 are reserved for the return address
  // 32 is reserved for the stack pointer
  // 33 is reserved for the frame pointer
  // 34 is reserved for the base pointer
  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<
    !foreach(i, !range(4, 30), !cast<Register>("SGPR"#i))  // SGPR4-29
  >>>,

  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<
    !foreach(i, !range(0, 32), !cast<Register>("VGPR"#i))  // VGPR0-31
  >>>,

  CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1, bf16, v2bf16], CCAssignToStack<4, 4>>
]>;

// Presumably the return-value counterpart of CC_SI_Gfx (going by the RetCC_
// prefix; confirm against the users of this def).
//
// i1 is promoted to i32 first, then i1/i16 values with an extension flag are
// promoted to i32, and non-inreg values of the listed types are assigned to
// VGPR0-135. There is no SGPR or stack rule in this list.
def RetCC_SI_Gfx : CallingConv<[
  CCIfType<[i1], CCPromoteToType<i32>>,
  CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,

  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<
    !foreach(i, !range(0, 136), !cast<Register>("VGPR"#i))  // VGPR0-135
  >>>,
]>;

// Shader argument assignment: i1 is promoted to i32, inreg values of the
// listed types go to SGPR0-43 and non-inreg values go to VGPR0-135. This list
// contains no stack rule.
def CC_SI_SHADER : CallingConv<[

  CCIfType<[i1], CCPromoteToType<i32>>,

  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<
    !foreach(i, !range(0, 44), !cast<Register>("SGPR"#i))  // SGPR0-43
  >>>,

  // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs.
  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<
    !foreach(i, !range(0, 136), !cast<Register>("VGPR"#i))  // VGPR0-135
  >>>
]>;

// Presumably the return-value counterpart of CC_SI_SHADER (going by the
// RetCC_ prefix; confirm against the users of this def).
//
// Unlike CC_SI_SHADER, the register file is chosen by type rather than by the
// inreg flag: i32/i16/v2i16 go to SGPR0-43, and the floating-point types go to
// VGPR0-135.
def RetCC_SI_Shader : CallingConv<[
  CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
  CCIfType<[i32, i16, v2i16] , CCAssignToReg<
    !foreach(i, !range(0, 44), !cast<Register>("SGPR"#i))  // SGPR0-43
  >>,

  // 32*4 + 4 is the minimum for a fetch shader with 32 outputs.
  CCIfType<[f32, f16, v2f16, bf16, v2bf16] , CCAssignToReg<
    !foreach(i, !range(0, 136), !cast<Register>("VGPR"#i))  // VGPR0-135
  >>
]>;

// Callee-saved VGPRs. The list holds fourteen 8-register stripes starting at
// VGPR40 (40-47, 56-63, ..., 248-255); VGPR0-39 and the 8-register gaps
// between the stripes are not part of it.
def CSR_AMDGPU_VGPRs : CalleeSavedRegs<
  // The CSRs & scratch-registers are interleaved at a split boundary of 8.
  (add (sequence "VGPR%u", 40, 47),
    (sequence "VGPR%u", 56, 63),
    (sequence "VGPR%u", 72, 79),
    (sequence "VGPR%u", 88, 95),
    (sequence "VGPR%u", 104, 111),
    (sequence "VGPR%u", 120, 127),
    (sequence "VGPR%u", 136, 143),
    (sequence "VGPR%u", 152, 159),
    (sequence "VGPR%u", 168, 175),
    (sequence "VGPR%u", 184, 191),
    (sequence "VGPR%u", 200, 207),
    (sequence "VGPR%u", 216, 223),
    (sequence "VGPR%u", 232, 239),
    (sequence "VGPR%u", 248, 255))
>;

// Callee-saved AGPRs: AGPR32 through AGPR255.
def CSR_AMDGPU_AGPRs : CalleeSavedRegs<
  (sequence "AGPR%u", 32, 255)
>;

// Callee-saved SGPRs, striped like CSR_AMDGPU_VGPRs. The resulting set is
// s30-s39, s48-s55, s64-s71, s80-s87 and s96-s105; the 8-register ranges
// between those stripes (s40-s47, s56-s63, s72-s79, s88-s95) are not listed.
def CSR_AMDGPU_SGPRs : CalleeSavedRegs<
  // Ensure that s30-s31 (return address), s32 (stack pointer), s33 (frame pointer),
  // and s34 (base pointer) are callee-saved. The striped layout starts from s40,
  // with a stripe width of 8. The last stripe is 10 wide instead of 8, to avoid
  // ending with a 2-wide stripe.
  (add (sequence "SGPR%u", 30, 39),
       (sequence "SGPR%u", 48, 55),
       (sequence "SGPR%u", 64, 71),
       (sequence "SGPR%u", 80, 87),
       (sequence "SGPR%u", 96, 105))
>;

// Callee-saved SGPRs for the SI_Gfx register sets: SGPR4-31 and SGPR64-105.
// This set is two contiguous ranges, not the striped CSR_AMDGPU_SGPRs layout.
def CSR_AMDGPU_SI_Gfx_SGPRs : CalleeSavedRegs<
  (add (sequence "SGPR%u", 4, 31), (sequence "SGPR%u", 64, 105))
>;

// Combined callee-saved set: CSR_AMDGPU_VGPRs followed by CSR_AMDGPU_SGPRs.
def CSR_AMDGPU : CalleeSavedRegs<
  (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SGPRs)
>;

// CSR_AMDGPU extended with the callee-saved AGPRs (CSR_AMDGPU_AGPRs).
def CSR_AMDGPU_GFX90AInsts : CalleeSavedRegs<
  (add CSR_AMDGPU, CSR_AMDGPU_AGPRs)
>;

// Combined SI_Gfx callee-saved set: CSR_AMDGPU_VGPRs followed by
// CSR_AMDGPU_SI_Gfx_SGPRs.
def CSR_AMDGPU_SI_Gfx : CalleeSavedRegs<
  (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SI_Gfx_SGPRs)
>;

// CSR_AMDGPU_SI_Gfx extended with the callee-saved AGPRs (CSR_AMDGPU_AGPRs).
def CSR_AMDGPU_SI_Gfx_GFX90AInsts : CalleeSavedRegs<
  (add CSR_AMDGPU_SI_Gfx, CSR_AMDGPU_AGPRs)
>;

// Preserved registers for CS_ChainPreserve: VGPR8 through VGPR255. No SGPRs
// or AGPRs are listed.
def CSR_AMDGPU_CS_ChainPreserve : CalleeSavedRegs<
  (sequence "VGPR%u", 8, 255)
>;

// Empty callee-saved set: no register is listed.
def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>;

// Calling convention for leaf functions
//
// Rules are tried in the order listed:
//   1. byval arguments are passed by value via CCPassByVal<4, 4>;
//   2. i1 is promoted to i32, and i8/i16 with an extension flag are promoted
//      to i32;
//   3. inreg values of the listed types are assigned to SGPR0-29;
//   4. values of the listed types are assigned to VGPR0-31;
//   5. the last rule assigns a stack slot via CCAssignToStack<4, 4>.
def CC_AMDGPU_Func : CallingConv<[
  CCIfByVal<CCPassByVal<4, 4>>,
  CCIfType<[i1], CCPromoteToType<i32>>,
  CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,

  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<
    !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i))  // SGPR0-29
  >>>,

  CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1, bf16, v2bf16], CCAssignToReg<
    !foreach(i, !range(0, 32), !cast<Register>("VGPR"#i))  // VGPR0-31
  >>,
  CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1, bf16, v2bf16], CCAssignToStack<4, 4>>
]>;

// Calling convention for leaf functions
//
// Presumably the return-value counterpart of CC_AMDGPU_Func (going by the
// RetCC_ prefix; confirm against the users of this def). i1 is promoted to
// i32, i1/i16 with an extension flag are promoted to i32, and values of the
// listed types are assigned to VGPR0-31. There is no SGPR or stack rule in
// this list.
def RetCC_AMDGPU_Func : CallingConv<[
  CCIfType<[i1], CCPromoteToType<i32>>,
  CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
  CCIfType<[i32, f32, i16, f16, v2i16, v2f16, bf16, v2bf16], CCAssignToReg<
    !foreach(i, !range(0, 32), !cast<Register>("VGPR"#i))  // VGPR0-31
  >>,
]>;

// Top-level dispatcher. Both rules require a subtarget generation of at least
// SOUTHERN_ISLANDS: the first delegates to CC_SI_SHADER, and the second, which
// additionally requires the C calling convention, delegates to CC_AMDGPU_Func.
def CC_AMDGPU : CallingConv<[
  CCIf<"State.getMachineFunction().getSubtarget<GCNSubtarget>().getGeneration() >= "
         "AMDGPUSubtarget::SOUTHERN_ISLANDS",
       CCDelegateTo<CC_SI_SHADER>>,
  CCIf<"State.getMachineFunction().getSubtarget<GCNSubtarget>().getGeneration() >= "
         "AMDGPUSubtarget::SOUTHERN_ISLANDS && State.getCallingConv() == CallingConv::C",
       CCDelegateTo<CC_AMDGPU_Func>>
]>;

// Argument assignment for CS_CHAIN: inreg values of the listed types go to
// SGPRs starting at SGPR0, and non-inreg values go to VGPRs starting at VGPR8.
// This list contains no stack rule.
// NOTE(review): the !range upper bounds are exclusive, so SGPR105 and VGPR255
// are not in these lists - confirm that this is intended.
def CC_AMDGPU_CS_CHAIN : CallingConv<[
  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<
    !foreach(i, !range(105), !cast<Register>("SGPR"#i))  // SGPR0-104
  >>>,

  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16, bf16, v2bf16] , CCAssignToReg<
    !foreach(i, !range(8, 255), !cast<Register>("VGPR"#i))  // VGPR8-254
  >>>
]>;

// Trivial class to denote when a def is used only to get a RegMask, i.e.
// SaveList is ignored and the def is not used as part of any calling
// convention.
class RegMask<dag mask> : CalleeSavedRegs<mask>;

// Register mask covering VGPR0 through VGPR255.
def AMDGPU_AllVGPRs : RegMask<
  (sequence "VGPR%u", 0, 255)
>;

// Register mask covering AGPR0 through AGPR255.
def AMDGPU_AllAGPRs : RegMask<
  (sequence "AGPR%u", 0, 255)
>;

// Register mask combining AMDGPU_AllVGPRs and AMDGPU_AllAGPRs.
def AMDGPU_AllVectorRegs : RegMask<
  (add AMDGPU_AllVGPRs, AMDGPU_AllAGPRs)
>;

// Register mask covering SGPR0 through SGPR105 plus VCC_LO and VCC_HI.
def AMDGPU_AllAllocatableSRegs : RegMask<
  (add (sequence "SGPR%u", 0, 105), VCC_LO, VCC_HI)
>;