[AMDGPU] Add intrinsic exposing s_alloc_vgpr (#163951)
Make it possible to use `s_alloc_vgpr` at the IR level. This is a huge footgun, and its use for anything other than compiler-internal purposes is heavily discouraged. The calling code must make sure that it does not allocate fewer VGPRs than necessary - the intrinsic is NOT a request to the backend to limit the number of VGPRs it uses. (In essence it's not so different from what we do with the dynamic VGPR flags of the `amdgcn.cs.chain` intrinsic; it just makes it possible to use this functionality in other scenarios.)
This commit is contained in:
parent
7f1907cea0
commit
24405f070f
@ -397,6 +397,19 @@ def int_amdgcn_s_wait_loadcnt : AMDGPUWaitIntrinsic;
|
||||
def int_amdgcn_s_wait_samplecnt : AMDGPUWaitIntrinsic;
|
||||
def int_amdgcn_s_wait_storecnt : AMDGPUWaitIntrinsic;
|
||||
|
||||
// Request the hardware to allocate the given number of VGPRs. The actual number
|
||||
// of allocated VGPRs may be rounded up to match hardware block boundaries.
|
||||
// It is the responsibility of the calling code to ensure it does not allocate
|
||||
// below the VGPR requirements of the current shader. This intrinsic is only
|
||||
// available on targets that support dynamic VGPR mode.
|
||||
//
// The operand is consumed as a scalar: instruction selection routes a divergent
// value through readfirstlane before it reaches s_alloc_vgpr. The i1 result
// reflects SCC as set by the instruction.
def int_amdgcn_s_alloc_vgpr : DefaultAttrsIntrinsic<
|
||||
[llvm_i1_ty], // Returns true if the allocation succeeded, false otherwise.
|
||||
[llvm_i32_ty], // The number of VGPRs to allocate.
|
||||
[NoUndef<RetIndex>, NoUndef<ArgIndex<0>>,
|
||||
IntrNoMem, IntrHasSideEffects, IntrConvergent
|
||||
]
|
||||
>;
|
||||
|
||||
def int_amdgcn_div_scale : PureIntrinsic<
|
||||
// 1st parameter: Numerator
|
||||
// 2nd parameter: Denominator
|
||||
|
||||
@ -2401,6 +2401,22 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
|
||||
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
|
||||
case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn:
|
||||
return selectDSBvhStackIntrinsic(I);
|
||||
case Intrinsic::amdgcn_s_alloc_vgpr: {
|
||||
// S_ALLOC_VGPR has no destination register; it only sets SCC implicitly.
|
||||
// The intrinsic's result is therefore produced by a COPY from SCC.
|
||||
MachineBasicBlock *MBB = I.getParent();
|
||||
const DebugLoc &DL = I.getDebugLoc();
|
||||
|
||||
// Operand 0 is the i1 result vreg; operand 2 is the requested VGPR count.
Register ResReg = I.getOperand(0).getReg();
|
||||
|
||||
// Both replacement instructions are inserted immediately before I, in
// order: first the allocation, then the copy that reads the SCC it defines.
MachineInstr *AllocMI = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_ALLOC_VGPR))
|
||||
.add(I.getOperand(2));
|
||||
(void)BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), ResReg)
|
||||
.addReg(AMDGPU::SCC);
|
||||
// The generic intrinsic instruction is now fully replaced.
I.eraseFromParent();
|
||||
// Constrain the operands of the new S_ALLOC_VGPR, then place the result in
// a 32-bit SGPR class; the latter determines whether selection succeeded.
constrainSelectedInstRegOperands(*AllocMI, TII, TRI, RBI);
|
||||
return RBI.constrainGenericRegister(ResReg, AMDGPU::SReg_32RegClass, *MRI);
|
||||
}
|
||||
case Intrinsic::amdgcn_s_barrier_init:
|
||||
case Intrinsic::amdgcn_s_barrier_signal_var:
|
||||
return selectNamedBarrierInit(I, IntrinsicID);
|
||||
|
||||
@ -3285,6 +3285,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
|
||||
constrainOpWithReadfirstlane(B, MI, 2); // M0
|
||||
return;
|
||||
}
|
||||
case Intrinsic::amdgcn_s_alloc_vgpr:
|
||||
// Operand 2 is the requested VGPR count. It is mapped to the SGPR bank, so
// a value that arrives in a VGPR is read back through readfirstlane.
constrainOpWithReadfirstlane(B, MI, 2);
|
||||
return;
|
||||
case Intrinsic::amdgcn_s_sendmsg:
|
||||
case Intrinsic::amdgcn_s_sendmsghalt: {
|
||||
// FIXME: Should this use a waterfall loop?
|
||||
@ -5386,6 +5389,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
|
||||
OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
|
||||
OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
|
||||
break;
|
||||
case Intrinsic::amdgcn_s_alloc_vgpr:
|
||||
// Operand 0: the 1-bit success result, kept on the SGPR bank.
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1);
|
||||
// Operand 2: the 32-bit VGPR count, also required on the SGPR bank.
OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
|
||||
break;
|
||||
case Intrinsic::amdgcn_s_sendmsg:
|
||||
case Intrinsic::amdgcn_s_sendmsghalt: {
|
||||
// This must be an SGPR, but accept a VGPR.
|
||||
|
||||
@ -407,6 +407,7 @@ def : AlwaysUniform<int_amdgcn_cluster_workgroup_max_flat_id>;
|
||||
def : AlwaysUniform<int_amdgcn_workgroup_id_x>;
|
||||
def : AlwaysUniform<int_amdgcn_workgroup_id_y>;
|
||||
def : AlwaysUniform<int_amdgcn_workgroup_id_z>;
|
||||
// s_alloc_vgpr reports its result through SCC, so the returned i1 is treated
// as uniform.
def : AlwaysUniform<int_amdgcn_s_alloc_vgpr>;
|
||||
def : AlwaysUniform<int_amdgcn_s_getpc>;
|
||||
def : AlwaysUniform<int_amdgcn_s_getreg>;
|
||||
def : AlwaysUniform<int_amdgcn_s_memrealtime>;
|
||||
|
||||
@ -11234,6 +11234,19 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
|
||||
return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
|
||||
Ops, M->getMemOperand());
|
||||
}
|
||||
case Intrinsic::amdgcn_s_alloc_vgpr: {
|
||||
// Operand 2 is the requested VGPR count (operand 0 is the chain, operand 1
// the intrinsic ID; both are forwarded unchanged below).
SDValue NumVGPRs = Op.getOperand(2);
|
||||
// A uniform count needs no custom lowering; keep the node as it is.
if (!NumVGPRs->isDivergent())
|
||||
return Op;
|
||||
|
||||
// A divergent count is made uniform by wrapping it in a chainless
// llvm.amdgcn.readfirstlane intrinsic node.
SDValue ReadFirstLaneID =
|
||||
DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
|
||||
NumVGPRs = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
|
||||
ReadFirstLaneID, NumVGPRs);
|
||||
|
||||
// Rebuild the intrinsic with the same result types, chain and intrinsic ID,
// substituting the uniform count.
return DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, Op->getVTList(),
|
||||
Op.getOperand(0), Op.getOperand(1), NumVGPRs);
|
||||
}
|
||||
case Intrinsic::amdgcn_s_get_barrier_state:
|
||||
case Intrinsic::amdgcn_s_get_named_barrier_state: {
|
||||
SDValue Chain = Op->getOperand(0);
|
||||
|
||||
@ -433,8 +433,10 @@ let SubtargetPredicate = isGFX11Plus in {
|
||||
} // End SubtargetPredicate = isGFX11Plus
|
||||
|
||||
let SubtargetPredicate = isGFX12Plus in {
|
||||
let hasSideEffects = 1, Defs = [SCC] in {
|
||||
def S_ALLOC_VGPR : SOP1_0_32 <"s_alloc_vgpr">;
|
||||
let hasSideEffects = 1, isConvergent = 1, Defs = [SCC] in {
|
||||
def S_ALLOC_VGPR : SOP1_0_32 <"s_alloc_vgpr",
|
||||
[(set SCC, (int_amdgcn_s_alloc_vgpr SSrc_b32:$src0))]
|
||||
>;
|
||||
}
|
||||
} // End SubtargetPredicate = isGFX12Plus
|
||||
|
||||
|
||||
@ -183,6 +183,15 @@ define void @cluster_workgroup_max_flat_id(ptr addrspace(1) inreg %out) {
|
||||
ret void
|
||||
}
|
||||
|
||||
; The i1 returned by llvm.amdgcn.s.alloc.vgpr, and the select and store that
; depend on it, must all be reported as uniform.
; CHECK-LABEL: for function 's_alloc_vgpr':
|
||||
; CHECK: ALL VALUES UNIFORM
|
||||
define void @s_alloc_vgpr(i32 inreg %n, ptr addrspace(1) inreg %out) {
|
||||
%scc = call i1 @llvm.amdgcn.s.alloc.vgpr(i32 %n)
|
||||
%sel = select i1 %scc, i32 1, i32 0
|
||||
store i32 %sel, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: for function 's_memtime':
|
||||
; CHECK: ALL VALUES UNIFORM
|
||||
define void @s_memtime(ptr addrspace(1) inreg %out) {
|
||||
|
||||
123
llvm/test/CodeGen/AMDGPU/intrinsic-amdgcn-s-alloc-vgpr.ll
Normal file
123
llvm/test/CodeGen/AMDGPU/intrinsic-amdgcn-s-alloc-vgpr.ll
Normal file
@ -0,0 +1,123 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
|
||||
; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1250 < %s | FileCheck %s --check-prefix=GISEL
|
||||
; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1250 < %s | FileCheck %s --check-prefix=DAGISEL
|
||||
|
||||
declare i1 @llvm.amdgcn.s.alloc.vgpr(i32)
|
||||
|
||||
; Constant VGPR count: the immediate is used directly as the s_alloc_vgpr
; operand, and the i1 result is materialized from SCC with s_cselect_b32.
define amdgpu_cs void @test_alloc_vreg_const(ptr addrspace(1) %out) #0 {
|
||||
; GISEL-LABEL: test_alloc_vreg_const:
|
||||
; GISEL: ; %bb.0: ; %entry
|
||||
; GISEL-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2)
|
||||
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
|
||||
; GISEL-NEXT: s_cmp_lg_u32 0, s33
|
||||
; GISEL-NEXT: s_cmovk_i32 s33, 0x1c0
|
||||
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
|
||||
; GISEL-NEXT: s_alloc_vgpr 45
|
||||
; GISEL-NEXT: s_cselect_b32 s0, 1, 0
|
||||
; GISEL-NEXT: s_and_b32 s0, s0, 1
|
||||
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GISEL-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GISEL-NEXT: global_store_b32 v[0:1], v2, off
|
||||
; GISEL-NEXT: s_alloc_vgpr 0
|
||||
; GISEL-NEXT: s_endpgm
|
||||
;
|
||||
; DAGISEL-LABEL: test_alloc_vreg_const:
|
||||
; DAGISEL: ; %bb.0: ; %entry
|
||||
; DAGISEL-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2)
|
||||
; DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
|
||||
; DAGISEL-NEXT: s_cmp_lg_u32 0, s33
|
||||
; DAGISEL-NEXT: s_cmovk_i32 s33, 0x1c0
|
||||
; DAGISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
|
||||
; DAGISEL-NEXT: s_alloc_vgpr 45
|
||||
; DAGISEL-NEXT: s_cselect_b32 s0, -1, 0
|
||||
; DAGISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
|
||||
; DAGISEL-NEXT: global_store_b32 v[0:1], v2, off
|
||||
; DAGISEL-NEXT: s_alloc_vgpr 0
|
||||
; DAGISEL-NEXT: s_endpgm
|
||||
entry:
|
||||
%scc = call i1 @llvm.amdgcn.s.alloc.vgpr(i32 45)
|
||||
%sel = select i1 %scc, i32 1, i32 0
|
||||
store i32 %sel, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Uniform VGPR count passed as an inreg argument: the incoming SGPR is used
; directly as the s_alloc_vgpr operand, with no readfirstlane.
define amdgpu_cs void @test_alloc_vreg_var(i32 inreg %n, ptr addrspace(1) %out) #0 {
|
||||
; GISEL-LABEL: test_alloc_vreg_var:
|
||||
; GISEL: ; %bb.0: ; %entry
|
||||
; GISEL-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2)
|
||||
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
|
||||
; GISEL-NEXT: s_cmp_lg_u32 0, s33
|
||||
; GISEL-NEXT: s_cmovk_i32 s33, 0x1c0
|
||||
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
|
||||
; GISEL-NEXT: s_alloc_vgpr s0
|
||||
; GISEL-NEXT: s_cselect_b32 s0, 1, 0
|
||||
; GISEL-NEXT: s_and_b32 s0, s0, 1
|
||||
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GISEL-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GISEL-NEXT: global_store_b32 v[0:1], v2, off
|
||||
; GISEL-NEXT: s_alloc_vgpr 0
|
||||
; GISEL-NEXT: s_endpgm
|
||||
;
|
||||
; DAGISEL-LABEL: test_alloc_vreg_var:
|
||||
; DAGISEL: ; %bb.0: ; %entry
|
||||
; DAGISEL-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2)
|
||||
; DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
|
||||
; DAGISEL-NEXT: s_cmp_lg_u32 0, s33
|
||||
; DAGISEL-NEXT: s_cmovk_i32 s33, 0x1c0
|
||||
; DAGISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
|
||||
; DAGISEL-NEXT: s_alloc_vgpr s0
|
||||
; DAGISEL-NEXT: s_cselect_b32 s0, -1, 0
|
||||
; DAGISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
|
||||
; DAGISEL-NEXT: global_store_b32 v[0:1], v2, off
|
||||
; DAGISEL-NEXT: s_alloc_vgpr 0
|
||||
; DAGISEL-NEXT: s_endpgm
|
||||
entry:
|
||||
%scc = call i1 @llvm.amdgcn.s.alloc.vgpr(i32 %n)
|
||||
%sel = select i1 %scc, i32 1, i32 0
|
||||
store i32 %sel, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; VGPR count arriving in a VGPR (non-inreg argument): it is first moved to an
; SGPR with v_readfirstlane_b32, and that SGPR feeds s_alloc_vgpr.
define amdgpu_cs void @test_alloc_vreg_vgpr(i32 %n, ptr addrspace(1) %out) #0 {
|
||||
; GISEL-LABEL: test_alloc_vreg_vgpr:
|
||||
; GISEL: ; %bb.0: ; %entry
|
||||
; GISEL-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2)
|
||||
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GISEL-NEXT: s_cmp_lg_u32 0, s33
|
||||
; GISEL-NEXT: s_cmovk_i32 s33, 0x1c0
|
||||
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
|
||||
; GISEL-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
|
||||
; GISEL-NEXT: s_alloc_vgpr s0
|
||||
; GISEL-NEXT: s_cselect_b32 s0, 1, 0
|
||||
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GISEL-NEXT: s_and_b32 s0, s0, 1
|
||||
; GISEL-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GISEL-NEXT: global_store_b32 v[4:5], v0, off
|
||||
; GISEL-NEXT: s_alloc_vgpr 0
|
||||
; GISEL-NEXT: s_endpgm
|
||||
;
|
||||
; DAGISEL-LABEL: test_alloc_vreg_vgpr:
|
||||
; DAGISEL: ; %bb.0: ; %entry
|
||||
; DAGISEL-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2)
|
||||
; DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; DAGISEL-NEXT: s_cmp_lg_u32 0, s33
|
||||
; DAGISEL-NEXT: s_cmovk_i32 s33, 0x1c0
|
||||
; DAGISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
|
||||
; DAGISEL-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; DAGISEL-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
|
||||
; DAGISEL-NEXT: s_alloc_vgpr s0
|
||||
; DAGISEL-NEXT: s_cselect_b32 s0, -1, 0
|
||||
; DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; DAGISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
|
||||
; DAGISEL-NEXT: global_store_b32 v[2:3], v0, off
|
||||
; DAGISEL-NEXT: s_alloc_vgpr 0
|
||||
; DAGISEL-NEXT: s_endpgm
|
||||
entry:
|
||||
%scc = call i1 @llvm.amdgcn.s.alloc.vgpr(i32 %n)
|
||||
%sel = select i1 %scc, i32 1, i32 0
|
||||
store i32 %sel, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "amdgpu-dynamic-vgpr-block-size" = "16" }
|
||||
Loading…
x
Reference in New Issue
Block a user