[AMDGPU] Per-subtarget DPP instruction classification (#153096)

This is NFCI at this point.
This commit is contained in:
Stanislav Mekhanoshin 2025-08-11 15:41:02 -07:00 committed by GitHub
parent b9ecee9d47
commit ea14834966
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 120 additions and 34 deletions

View File

@ -5653,7 +5653,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
unsigned SplitSize = 32;
if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
ST.hasDPALU_DPP() &&
AMDGPU::isLegalDPALU_DPPControl(MI.getOperand(4).getImm()))
AMDGPU::isLegalDPALU_DPPControl(ST, MI.getOperand(4).getImm()))
SplitSize = 64;
if (Size == SplitSize) {

View File

@ -5052,11 +5052,13 @@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst,
if (DppCtrlIdx >= 0) {
unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm();
if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl) &&
AMDGPU::isDPALU_DPP(MII.get(Opc))) {
// DP ALU DPP is supported for row_newbcast only on GFX9*
if (!AMDGPU::isLegalDPALU_DPPControl(getSTI(), DppCtrl) &&
AMDGPU::isDPALU_DPP(MII.get(Opc), getSTI())) {
// DP ALU DPP is supported for row_newbcast only on GFX9* and row_share
// only on GFX12.
SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands);
Error(S, "DP ALU dpp only supports row_newbcast");
Error(S, isGFX12() ? "DP ALU dpp only supports row_share"
: "DP ALU dpp only supports row_newbcast");
return false;
}
}

View File

@ -549,11 +549,17 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
return false;
}
if (MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp) {
auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl);
assert(DppCtrl && DppCtrl->isImm());
if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl->getImm())) {
auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl);
assert(DppCtrl && DppCtrl->isImm());
unsigned DppCtrlVal = DppCtrl->getImm();
if ((MovMI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp)) {
if (!ST->hasFeature(AMDGPU::FeatureDPALU_DPP)) {
LLVM_DEBUG(dbgs() << " failed: 64 bit dpp move is unsupported\n");
// Split it.
return false;
}
if (!AMDGPU::isLegalDPALU_DPPControl(*ST, DppCtrlVal)) {
LLVM_DEBUG(dbgs() << " failed: 64 bit dpp move uses unsupported"
" control value\n");
// Let it split, then control may become legal.
@ -709,6 +715,20 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
break;
}
if (!ST->hasFeature(AMDGPU::FeatureDPALU_DPP) &&
AMDGPU::isDPALU_DPP32BitOpc(OrigOp)) {
LLVM_DEBUG(dbgs() << " " << OrigMI
<< " failed: DPP ALU DPP is not supported\n");
break;
}
if (!AMDGPU::isLegalDPALU_DPPControl(*ST, DppCtrlVal) &&
AMDGPU::isDPALU_DPP(TII->get(OrigOp), *ST)) {
LLVM_DEBUG(dbgs() << " " << OrigMI
<< " failed: not valid 64-bit DPP control value\n");
break;
}
LLVM_DEBUG(dbgs() << " combining: " << OrigMI);
if (Use == Src0) {
if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,

View File

@ -976,8 +976,10 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
unsigned Imm = MI->getOperand(OpNo).getImm();
const MCInstrDesc &Desc = MII.get(MI->getOpcode());
if (!AMDGPU::isLegalDPALU_DPPControl(Imm) && AMDGPU::isDPALU_DPP(Desc)) {
O << " /* DP ALU dpp only supports row_newbcast */";
if (!AMDGPU::isLegalDPALU_DPPControl(STI, Imm) &&
AMDGPU::isDPALU_DPP(Desc, STI)) {
O << " /* DP ALU dpp only supports "
<< (isGFX12(STI) ? "row_share" : "row_newbcast") << " */";
return;
}
if (Imm <= DppCtrl::QUAD_PERM_LAST) {

View File

@ -6621,7 +6621,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
unsigned SplitSize = 32;
if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
ST->hasDPALU_DPP() &&
AMDGPU::isLegalDPALU_DPPControl(N->getConstantOperandVal(3)))
AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
SplitSize = 64;
auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,

View File

@ -2616,9 +2616,9 @@ std::pair<MachineInstr*, MachineInstr*>
SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
if (ST.hasMovB64() &&
if (ST.hasMovB64() && ST.hasFeature(AMDGPU::FeatureDPALU_DPP) &&
AMDGPU::isLegalDPALU_DPPControl(
getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
ST, getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
return std::pair(&MI, nullptr);
}
@ -5433,7 +5433,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
!AMDGPU::isLegalDPALU_DPPControl(DC) && AMDGPU::isDPALU_DPP(Desc)) {
!AMDGPU::isLegalDPALU_DPPControl(ST, DC) &&
AMDGPU::isDPALU_DPP(Desc, ST)) {
ErrInfo = "Invalid dpp_ctrl value: "
"DP ALU dpp only support row_newbcast";
return false;

View File

@ -1954,6 +1954,7 @@ class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> {
!eq(VT, v2f16) : VCSrc_v2f16,
!eq(VT, v2bf16) : VCSrc_v2bf16,
!eq(VT, f32) : VCSrc_f32,
!eq(VT, f64) : VCSrc_f64,
!eq(VT, v2i32) : VCSrc_v2b32,
1 : VCSrc_b32);
}

View File

@ -3309,7 +3309,33 @@ bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc) {
return false;
}
bool isDPALU_DPP(const MCInstrDesc &OpDesc) {
/// \returns true if \p Opc is one of the opcodes listed below, i.e. an
/// instruction treated as DP ALU DPP even though it has no 64-bit operands.
bool isDPALU_DPP32BitOpc(unsigned Opc) {
  // The e64 form, its DPP form and the GFX1250 DPP encoding of each
  // instruction are all classified the same way.
  static constexpr unsigned DPALU32BitOpcodes[] = {
      AMDGPU::V_MUL_LO_U32_e64,
      AMDGPU::V_MUL_LO_U32_e64_dpp,
      AMDGPU::V_MUL_LO_U32_e64_dpp_gfx1250,
      AMDGPU::V_MUL_HI_U32_e64,
      AMDGPU::V_MUL_HI_U32_e64_dpp,
      AMDGPU::V_MUL_HI_U32_e64_dpp_gfx1250,
      AMDGPU::V_MUL_HI_I32_e64,
      AMDGPU::V_MUL_HI_I32_e64_dpp,
      AMDGPU::V_MUL_HI_I32_e64_dpp_gfx1250,
      AMDGPU::V_MAD_U32_e64,
      AMDGPU::V_MAD_U32_e64_dpp,
      AMDGPU::V_MAD_U32_e64_dpp_gfx1250,
  };

  for (unsigned Candidate : DPALU32BitOpcodes) {
    if (Candidate == Opc)
      return true;
  }
  return false;
}
/// \returns true if the instruction described by \p OpDesc is a DP ALU DPP
/// instruction on subtarget \p ST.
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST) {
  // Nothing is classified as DP ALU DPP unless the subtarget has the feature.
  const bool HasDPALUDPP = ST.hasFeature(AMDGPU::FeatureDPALU_DPP);
  if (HasDPALUDPP) {
    // The listed 32-bit opcodes count only when GFX1250 instructions are
    // available; every other opcode is decided by its 64-bit VGPR operands.
    const bool Is32BitDPALUOpc = isDPALU_DPP32BitOpc(OpDesc.getOpcode());
    return Is32BitDPALUOpc ? ST.hasFeature(AMDGPU::FeatureGFX1250Insts)
                           : hasAny64BitVGPROperands(OpDesc);
  }
  return false;
}

View File

@ -1750,15 +1750,22 @@ unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST);
bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
LLVM_READNONE
inline bool isLegalDPALU_DPPControl(unsigned DC) {
return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST;
inline bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC) {
  // Select the inclusive range of dpp_ctrl values accepted on this subtarget,
  // then test DC against it. Subtargets matching neither check accept nothing.
  unsigned RangeFirst = 0;
  unsigned RangeLast = 0;
  if (isGFX12(ST)) {
    // GFX12: only the row_share controls are legal.
    RangeFirst = DPP::ROW_SHARE_FIRST;
    RangeLast = DPP::ROW_SHARE_LAST;
  } else if (isGFX90A(ST)) {
    // GFX90A: only the row_newbcast controls are legal.
    RangeFirst = DPP::ROW_NEWBCAST_FIRST;
    RangeLast = DPP::ROW_NEWBCAST_LAST;
  } else {
    return false;
  }
  return RangeFirst <= DC && DC <= RangeLast;
}
/// \returns true if an instruction may have a 64-bit VGPR operand.
bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc);
/// \returns true if an instruction is a DP ALU DPP without any 64-bit operands.
bool isDPALU_DPP32BitOpc(unsigned Opc);
/// \returns true if an instruction is a DP ALU DPP.
bool isDPALU_DPP(const MCInstrDesc &OpDesc);
bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST);
/// \returns true if the intrinsic is divergent
bool isIntrinsicSourceOfDivergence(unsigned IntrID);

View File

@ -2084,6 +2084,9 @@ multiclass VOP3_Realtriple_gfx11_gfx12<bits<10> op> :
multiclass VOP3_Real_Base_gfx11_gfx12<bits<10> op> :
VOP3_Real_Base<GFX11Gen, op>, VOP3_Real_Base<GFX12Gen, op>;
// Instantiates VOP3_Real_Base with the same opcode for both GFX11Gen and
// GFX12Not12_50Gen.
// NOTE(review): GFX12Not12_50Gen presumably covers GFX12 excluding GFX1250,
// leaving GFX1250 free to define its own encoding -- confirm against the
// generation's definition.
multiclass VOP3_Real_Base_gfx11_gfx12_not_gfx1250<bits<10> op> :
VOP3_Real_Base<GFX11Gen, op>, VOP3_Real_Base<GFX12Not12_50Gen, op>;
multiclass VOP3_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName,
string asmName> :
VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName>,
@ -2211,9 +2214,9 @@ defm V_MUL_F64 : VOP3_Real_Base_gfx11<0x328>;
defm V_MIN_F64 : VOP3_Real_Base_gfx11<0x329>;
defm V_MAX_F64 : VOP3_Real_Base_gfx11<0x32a>;
defm V_LDEXP_F64 : VOP3_Real_Base_gfx11_gfx12<0x32b>;
defm V_MUL_LO_U32 : VOP3_Real_Base_gfx11_gfx12<0x32c>;
defm V_MUL_HI_U32 : VOP3_Real_Base_gfx11_gfx12<0x32d>;
defm V_MUL_HI_I32 : VOP3_Real_Base_gfx11_gfx12<0x32e>;
defm V_MUL_LO_U32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32c>;
defm V_MUL_HI_U32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32d>;
defm V_MUL_HI_I32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32e>;
defm V_TRIG_PREOP_F64 : VOP3_Real_Base_gfx11_gfx12<0x32f>;
defm V_LSHLREV_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x338, "v_lshlrev_b16">;
defm V_LSHRREV_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x339, "v_lshrrev_b16">;
@ -2242,6 +2245,10 @@ let AssemblerPredicate = isGFX11Plus in {
}
// These instructions differ from GFX12 variant by supporting DPP:
defm V_MUL_LO_U32 : VOP3Only_Realtriple_gfx1250<0x32c>;
defm V_MUL_HI_U32 : VOP3Only_Realtriple_gfx1250<0x32d>;
defm V_MUL_HI_I32 : VOP3Only_Realtriple_gfx1250<0x32e>;
defm V_PERM_PK16_B4_U4 : VOP3Only_Real_Base_gfx1250<0x23f>;
defm V_PERM_PK16_B6_U4 : VOP3Only_Real_Base_gfx1250<0x242>;
defm V_PERM_PK16_B8_U4 : VOP3Only_Real_Base_gfx1250<0x243>;

View File

@ -1,12 +1,13 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s -check-prefixes=GCN,DPP64,GFX90A
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s -check-prefixes=GCN,DPP64,DPPMOV64,GFX942
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS,GFX10
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS,GFX11
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s -check-prefixes=GCN,DPP64,GFX90A,DPP64-GFX9 -DCTL=row_newbcast
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s -check-prefixes=GCN,DPP64,DPPMOV64,DPP64-GFX9,GFX942 -DCTL=row_newbcast
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS,GFX10 -DCTL=row_share
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX10PLUS,GFX11 -DCTL=row_share
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s -check-prefixes=GCN,DPP32,GFX1250 -DCTL=row_share
; GCN-LABEL: {{^}}dpp64_ceil:
; GCN: global_load_{{dwordx2|b64}} [[V:v\[[0-9:]+\]]],
; DPP64: v_ceil_f64_dpp [[V]], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; DPP64: v_ceil_f64_dpp [[V]], [[V]] [[CTL]]:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} [[CTL]]:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp64_ceil(ptr addrspace(1) %arg, i64 %in1) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
@ -21,8 +22,8 @@ define amdgpu_kernel void @dpp64_ceil(ptr addrspace(1) %arg, i64 %in1) {
; GCN-LABEL: {{^}}dpp64_rcp:
; GCN: global_load_{{dwordx2|b64}} [[V:v\[[0-9:]+\]]],
; DPP64: v_rcp_f64_dpp [[V]], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; DPP64-GFX9: v_rcp_f64_dpp [[V]], [[V]] [[CTL]]:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} [[CTL]]:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
define amdgpu_kernel void @dpp64_rcp(ptr addrspace(1) %arg, i64 %in1) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i64, ptr addrspace(1) %arg, i32 %id
@ -52,9 +53,9 @@ define amdgpu_kernel void @dpp64_rcp_unsupported_ctl(ptr addrspace(1) %arg, i64
; GCN-LABEL: {{^}}dpp64_div:
; GCN: global_load_{{dwordx2|b64}} [[V:v\[[0-9:]+\]]],
; DPPMOV64: v_mov_b64_dpp v[{{[0-9:]+}}], [[V]] row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; GFX90A-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_newbcast:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; GFX10PLUS-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} row_share:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; DPPMOV64: v_mov_b64_dpp v[{{[0-9:]+}}], [[V]] [[CTL]]:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; GFX90A-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} [[CTL]]:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; DPP32-COUNT-2: v_mov_b32_dpp v{{[0-9]+}}, v{{[0-9]+}} [[CTL]]:1 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; GCN: v_div_scale_f64
; GCN: v_rcp_f64_e32
define amdgpu_kernel void @dpp64_div(ptr addrspace(1) %arg, i64 %in1) {
@ -69,6 +70,25 @@ define amdgpu_kernel void @dpp64_div(ptr addrspace(1) %arg, i64 %in1) {
ret void
}
; On GFX9 it fails to combine because v_mul_lo_u32 has no e32 or dpp form.
; GCN-LABEL: {{^}}dpp_mul_row_share:
; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],
; DPP64-GFX9: v_mov_b32_e32 [[V2:v[0-9]+]], [[V]]
; DPP64-GFX9: v_mov_b32_dpp [[V2]], [[V2]] {{row_share|row_newbcast}}:0 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; DPP64-GFX9: v_mul_lo_u32 [[V]], [[V2]], [[V]]{{$}}
; GFX1250: v_mov_b32_e32 [[V2:v[0-9]+]], [[V]]
; GFX1250: v_mov_b32_dpp [[V2]], [[V2]] {{row_share|row_newbcast}}:0 row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
; GFX1250: v_mul_lo_u32 [[V]], [[V2]], [[V]]{{$}}
; Kernel under test: each work-item loads one i32, passes it through
; update.dpp with control value 336 (the checks above expect it to be printed
; as row_share or row_newbcast with index 0), multiplies the DPP result by the
; originally loaded value and stores the product back to the same address.
define amdgpu_kernel void @dpp_mul_row_share(ptr addrspace(1) %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %id
%load = load i32, ptr addrspace(1) %gep
; The same value is used for both the old and the source operand of the DPP op.
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %load, i32 %load, i32 336, i32 15, i32 15, i1 1)
%mul = mul i32 %tmp0, %load
store i32 %mul, ptr addrspace(1) %gep
ret void
}
; GCN-LABEL: {{^}}dpp64_loop:
; GCN: v_mov_b32_dpp
; DPP64: v_mov_b32_dpp