diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 28bd6c3409e4..7a627119cc91 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -397,6 +397,19 @@ def int_amdgcn_s_wait_loadcnt : AMDGPUWaitIntrinsic; def int_amdgcn_s_wait_samplecnt : AMDGPUWaitIntrinsic; def int_amdgcn_s_wait_storecnt : AMDGPUWaitIntrinsic; +// Request the hardware to allocate the given number of VGPRs. The actual number +// of allocated VGPRs may be rounded up to match hardware block boundaries. +// It is the responsibility of the calling code to ensure it does not allocate +// below the VGPR requirements of the current shader. This intrinsic is only +// available on targets that support dynamic VGPR mode. +def int_amdgcn_s_alloc_vgpr : DefaultAttrsIntrinsic< + [llvm_i1_ty], // Returns true if the allocation succeeded, false otherwise. + [llvm_i32_ty], // The number of VGPRs to allocate. + [NoUndef<RetIndex>, NoUndef<ArgIndex<0>>, + IntrNoMem, IntrHasSideEffects, IntrConvergent + ] +>; + def int_amdgcn_div_scale : PureIntrinsic< // 1st parameter: Numerator // 2nd parameter: Denominator diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index b96c2ef70dd8..c1af146e2cdd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2401,6 +2401,22 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS( case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn: case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: return selectDSBvhStackIntrinsic(I); + case Intrinsic::amdgcn_s_alloc_vgpr: { + // S_ALLOC_VGPR doesn't have a destination register, it just implicitly sets + // SCC. We then need to COPY it into the result vreg. 
+ MachineBasicBlock *MBB = I.getParent(); + const DebugLoc &DL = I.getDebugLoc(); + + Register ResReg = I.getOperand(0).getReg(); + + MachineInstr *AllocMI = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_ALLOC_VGPR)) + .add(I.getOperand(2)); + (void)BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), ResReg) + .addReg(AMDGPU::SCC); + I.eraseFromParent(); + constrainSelectedInstRegOperands(*AllocMI, TII, TRI, RBI); + return RBI.constrainGenericRegister(ResReg, AMDGPU::SReg_32RegClass, *MRI); + } case Intrinsic::amdgcn_s_barrier_init: case Intrinsic::amdgcn_s_barrier_signal_var: return selectNamedBarrierInit(I, IntrinsicID); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 04dbc0721ebf..4dd537bea396 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -3285,6 +3285,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl( constrainOpWithReadfirstlane(B, MI, 2); // M0 return; } + case Intrinsic::amdgcn_s_alloc_vgpr: + constrainOpWithReadfirstlane(B, MI, 2); + return; case Intrinsic::amdgcn_s_sendmsg: case Intrinsic::amdgcn_s_sendmsghalt: { // FIXME: Should this use a waterfall loop? @@ -5386,6 +5389,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32); OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI); break; + case Intrinsic::amdgcn_s_alloc_vgpr: + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32); + break; case Intrinsic::amdgcn_s_sendmsg: case Intrinsic::amdgcn_s_sendmsghalt: { // This must be an SGPR, but accept a VGPR. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 58a9b5511f2d..963bb91b6a7d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -407,6 +407,7 @@ def : AlwaysUniform; def : AlwaysUniform; def : AlwaysUniform; def : AlwaysUniform; +def : AlwaysUniform<int_amdgcn_s_alloc_vgpr>; def : AlwaysUniform; def : AlwaysUniform; def : AlwaysUniform; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 44535f471b70..70d9420970a0 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11234,6 +11234,19 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(), Ops, M->getMemOperand()); } + case Intrinsic::amdgcn_s_alloc_vgpr: { + SDValue NumVGPRs = Op.getOperand(2); + if (!NumVGPRs->isDivergent()) + return Op; + + SDValue ReadFirstLaneID = + DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32); + NumVGPRs = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, + ReadFirstLaneID, NumVGPRs); + + return DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, Op->getVTList(), + Op.getOperand(0), Op.getOperand(1), NumVGPRs); + } case Intrinsic::amdgcn_s_get_barrier_state: case Intrinsic::amdgcn_s_get_named_barrier_state: { SDValue Chain = Op->getOperand(0); diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 2a13acb6b0ce..1b0862e039c6 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -433,8 +433,10 @@ let SubtargetPredicate = isGFX11Plus in { } // End SubtargetPredicate = isGFX11Plus let SubtargetPredicate = isGFX12Plus in { - let hasSideEffects = 1, Defs = [SCC] in { - def S_ALLOC_VGPR : SOP1_0_32 <"s_alloc_vgpr">; + let hasSideEffects = 1, isConvergent = 1, Defs = [SCC] in { + def 
S_ALLOC_VGPR : SOP1_0_32 <"s_alloc_vgpr", + [(set SCC, (int_amdgcn_s_alloc_vgpr SSrc_b32:$src0))] + >; } } // End SubtargetPredicate = isGFX12Plus diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll index 9ff670bee0f8..3f56f12f3cb3 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/always_uniform.ll @@ -183,6 +183,15 @@ define void @cluster_workgroup_max_flat_id(ptr addrspace(1) inreg %out) { ret void } +; CHECK-LABEL: for function 's_alloc_vgpr': +; CHECK: ALL VALUES UNIFORM +define void @s_alloc_vgpr(i32 inreg %n, ptr addrspace(1) inreg %out) { + %scc = call i1 @llvm.amdgcn.s.alloc.vgpr(i32 %n) + %sel = select i1 %scc, i32 1, i32 0 + store i32 %sel, ptr addrspace(1) %out + ret void +} + ; CHECK-LABEL: for function 's_memtime': ; CHECK: ALL VALUES UNIFORM define void @s_memtime(ptr addrspace(1) inreg %out) { diff --git a/llvm/test/CodeGen/AMDGPU/intrinsic-amdgcn-s-alloc-vgpr.ll b/llvm/test/CodeGen/AMDGPU/intrinsic-amdgcn-s-alloc-vgpr.ll new file mode 100644 index 000000000000..2a7f93f68b8f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/intrinsic-amdgcn-s-alloc-vgpr.ll @@ -0,0 +1,123 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1250 < %s | FileCheck %s --check-prefix=GISEL +; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1250 < %s | FileCheck %s --check-prefix=DAGISEL + +declare i1 @llvm.amdgcn.s.alloc.vgpr(i32) + +define amdgpu_cs void @test_alloc_vreg_const(ptr addrspace(1) %out) #0 { +; GISEL-LABEL: test_alloc_vreg_const: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2) +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GISEL-NEXT: s_cmp_lg_u32 0, s33 +; GISEL-NEXT: s_cmovk_i32 s33, 0x1c0 
+; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GISEL-NEXT: s_alloc_vgpr 45 +; GISEL-NEXT: s_cselect_b32 s0, 1, 0 +; GISEL-NEXT: s_and_b32 s0, s0, 1 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GISEL-NEXT: s_alloc_vgpr 0 +; GISEL-NEXT: s_endpgm +; +; DAGISEL-LABEL: test_alloc_vreg_const: +; DAGISEL: ; %bb.0: ; %entry +; DAGISEL-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2) +; DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; DAGISEL-NEXT: s_cmp_lg_u32 0, s33 +; DAGISEL-NEXT: s_cmovk_i32 s33, 0x1c0 +; DAGISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; DAGISEL-NEXT: s_alloc_vgpr 45 +; DAGISEL-NEXT: s_cselect_b32 s0, -1, 0 +; DAGISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; DAGISEL-NEXT: global_store_b32 v[0:1], v2, off +; DAGISEL-NEXT: s_alloc_vgpr 0 +; DAGISEL-NEXT: s_endpgm +entry: + %scc = call i1 @llvm.amdgcn.s.alloc.vgpr(i32 45) + %sel = select i1 %scc, i32 1, i32 0 + store i32 %sel, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @test_alloc_vreg_var(i32 inreg %n, ptr addrspace(1) %out) #0 { +; GISEL-LABEL: test_alloc_vreg_var: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2) +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GISEL-NEXT: s_cmp_lg_u32 0, s33 +; GISEL-NEXT: s_cmovk_i32 s33, 0x1c0 +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GISEL-NEXT: s_alloc_vgpr s0 +; GISEL-NEXT: s_cselect_b32 s0, 1, 0 +; GISEL-NEXT: s_and_b32 s0, s0, 1 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GISEL-NEXT: global_store_b32 v[0:1], v2, off +; GISEL-NEXT: s_alloc_vgpr 0 +; GISEL-NEXT: s_endpgm +; +; DAGISEL-LABEL: test_alloc_vreg_var: +; DAGISEL: ; %bb.0: ; %entry +; DAGISEL-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 
2) +; DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; DAGISEL-NEXT: s_cmp_lg_u32 0, s33 +; DAGISEL-NEXT: s_cmovk_i32 s33, 0x1c0 +; DAGISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; DAGISEL-NEXT: s_alloc_vgpr s0 +; DAGISEL-NEXT: s_cselect_b32 s0, -1, 0 +; DAGISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; DAGISEL-NEXT: global_store_b32 v[0:1], v2, off +; DAGISEL-NEXT: s_alloc_vgpr 0 +; DAGISEL-NEXT: s_endpgm +entry: + %scc = call i1 @llvm.amdgcn.s.alloc.vgpr(i32 %n) + %sel = select i1 %scc, i32 1, i32 0 + store i32 %sel, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @test_alloc_vreg_vgpr(i32 %n, ptr addrspace(1) %out) #0 { +; GISEL-LABEL: test_alloc_vreg_vgpr: +; GISEL: ; %bb.0: ; %entry +; GISEL-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2) +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_cmp_lg_u32 0, s33 +; GISEL-NEXT: s_cmovk_i32 s33, 0x1c0 +; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GISEL-NEXT: s_alloc_vgpr s0 +; GISEL-NEXT: s_cselect_b32 s0, 1, 0 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GISEL-NEXT: s_and_b32 s0, s0, 1 +; GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GISEL-NEXT: global_store_b32 v[4:5], v0, off +; GISEL-NEXT: s_alloc_vgpr 0 +; GISEL-NEXT: s_endpgm +; +; DAGISEL-LABEL: test_alloc_vreg_vgpr: +; DAGISEL: ; %bb.0: ; %entry +; DAGISEL-NEXT: s_getreg_b32 s33, hwreg(HW_REG_WAVE_HW_ID2, 8, 2) +; DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; DAGISEL-NEXT: s_cmp_lg_u32 0, s33 +; DAGISEL-NEXT: s_cmovk_i32 s33, 0x1c0 +; DAGISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 +; DAGISEL-NEXT: v_readfirstlane_b32 s0, v0 +; DAGISEL-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; DAGISEL-NEXT: s_alloc_vgpr s0 +; DAGISEL-NEXT: s_cselect_b32 s0, -1, 0 +; 
DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; DAGISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; DAGISEL-NEXT: global_store_b32 v[2:3], v0, off +; DAGISEL-NEXT: s_alloc_vgpr 0 +; DAGISEL-NEXT: s_endpgm +entry: + %scc = call i1 @llvm.amdgcn.s.alloc.vgpr(i32 %n) + %sel = select i1 %scc, i32 1, i32 0 + store i32 %sel, ptr addrspace(1) %out + ret void +} + +attributes #0 = { "amdgpu-dynamic-vgpr-block-size" = "16" }