[AMDGPU] Add FeatureGFX13 and SMEM encoding for gfx13 (#177567)

For now, the list of features is based on gfx12 and gfx1250.

---------

Co-authored-by: Jay Foad <jay.foad@amd.com>
This commit is contained in:
Mariusz Sikora 2026-01-26 14:16:36 +01:00 committed by GitHub
parent 7cd5b2bffb
commit 3c0f5045e1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
22 changed files with 1694 additions and 44 deletions

View File

@ -496,6 +496,11 @@ defm GFX1250Insts : AMDGPUSubtargetFeature<"gfx1250-insts",
/*GenPredicate=*/0
>;
// Subtarget feature "gfx13-insts": marks availability of the additional
// GFX13+ instructions. GenPredicate is passed as 0 here.
// NOTE(review): presumably that suppresses an auto-generated predicate, so the
// GFX13 predicates have to be written by hand — confirm against the
// AMDGPUSubtargetFeature multiclass.
defm GFX13Insts : AMDGPUSubtargetFeature<"gfx13-insts",
"Additional instructions for GFX13+",
/*GenPredicate=*/0
>;
defm GFX10_3Insts : AMDGPUSubtargetFeature<"gfx10-3-insts",
"Additional instructions for GFX10.3",
/*GenPredicate=*/0
@ -1431,6 +1436,29 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
]
>;
// Generation feature for GFX13 ("gfx13"). The list bundles the instruction-set
// and capability features that come with the generation, including every
// earlier *Insts feature up to FeatureGFX12Insts plus FeatureGFX13Insts.
// NOTE(review): the introducing change describes the feature list as
// provisional ("for now ... based on gfx12 and gfx1250"); revisit it when
// GFX13-specific differences are added.
def FeatureGFX13 : GCNSubtargetFeatureGeneration<"GFX13",
"gfx13",
[FeatureFP64, FeatureMIMG_R128,
FeatureFlatAddressSpace, Feature16BitInsts,
FeatureInv2PiInlineImm, FeatureApertureRegs,
FeatureCIInsts, FeatureGFX8Insts, FeatureGFX9Insts, FeatureGFX10Insts,
FeatureGFX10_AEncoding, FeatureGFX10_BEncoding, FeatureGFX10_3Insts,
FeatureGFX11Insts, FeatureGFX12Insts, FeatureGFX13Insts, FeatureVOP3PInsts,
FeatureVOPDInsts, FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp,
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
FeatureAddNoCarryInsts, FeatureFmaMixInsts,
FeatureNoSdstCMPX, FeatureVscnt,
FeatureVOP3Literal, FeatureDPP8,
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
FeatureA16, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess,
FeatureUnalignedDSAccess, FeatureTrue16BitInsts,
FeatureDefaultComponentBroadcast, FeatureMaxHardClauseLength32,
FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts,
FeatureIEEEMinimumMaximumInsts, FeatureMinimum3Maximum3F32,
FeatureMinimum3Maximum3F16, FeatureAgentScopeFineGrainedRemoteMemoryAtomics
]
>;
//===----------------------------------------------------------------------===//
class FeatureSet<list<SubtargetFeature> Features_> {
@ -1989,6 +2017,65 @@ def FeatureISAVersion12_Generic: FeatureSet<
!listconcat(FeatureISAVersion12.Features,
[FeatureRequiresCOV6])>;
// ISA-level feature set for GFX13 processors. It starts from FeatureGFX13 and
// additionally enables FeatureGFX1250Insts, so any check keyed on the GFX1250
// instruction feature also holds for this ISA version.
// NOTE(review): FeatureIEEEMinimumMaximumInsts is also listed by FeatureGFX13,
// so the entry below looks redundant — confirm whether it is intentional.
def FeatureISAVersion13 : FeatureSet<
[FeatureGFX13,
FeatureGFX1250Insts,
FeatureAddressableLocalMemorySize65536,
Feature64BitLiterals,
FeatureLDSBankCount32,
FeatureDLInsts,
FeatureFmacF64Inst,
FeatureDot7Insts,
FeatureDot8Insts,
FeatureNSAEncoding,
FeaturePartialNSAEncoding,
FeatureShaderCyclesRegister,
FeatureArchitectedFlatScratch,
FeatureArchitectedSGPRs,
FeatureAtomicFaddRtnInsts,
FeatureAtomicFaddNoRtnInsts,
FeatureAtomicDsPkAdd16Insts,
FeatureAtomicFlatPkAdd16Insts,
FeatureAtomicBufferGlobalPkAddF16Insts,
FeatureAtomicGlobalPkAddBF16Inst,
FeatureAtomicBufferPkAddBF16Inst,
FeatureFlatAtomicFaddF32Inst,
FeatureFP8ConversionInsts,
FeaturePackedTID,
FeatureVcmpxPermlaneHazard,
FeatureSALUFloatInsts,
FeaturePseudoScalarTrans,
FeatureRestrictedSOffset,
FeatureScalarDwordx3Loads,
FeatureDPPSrc1SGPR,
FeatureBitOp3Insts,
FeatureTanhInsts,
FeatureTensorCvtLutInsts,
FeatureTransposeLoadF4F6Insts,
Feature1_5xVGPRs,
FeatureBF16TransInsts,
FeatureBF16ConversionInsts,
FeatureBF16PackedInsts,
FeaturePrngInst,
FeaturePermlane16Swap,
FeatureAshrPkInsts,
FeatureAtomicFMinFMaxF64GlobalInsts,
FeatureAtomicFMinFMaxF64FlatInsts,
FeatureFmaMixBF16Insts,
FeatureGloballyAddressableScratch,
FeatureCvtPkF16F32Inst,
FeatureF16BF16ToFP6BF6ConversionScaleInsts,
FeatureIEEEMinimumMaximumInsts,
FeatureClusters,
FeatureCubeInsts,
FeatureLerpInst,
FeatureSadInsts,
FeatureQsadInsts,
FeatureCvtNormInsts,
FeatureCvtPkNormVOP2Insts,
FeatureCvtPkNormVOP3Insts,
]>;
//===----------------------------------------------------------------------===//
def AMDGPUInstrInfo : InstrInfo {
@ -2273,7 +2360,7 @@ def isGFX11Plus :
def isGFX12Only :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX12">,
AssemblerPredicate<(all_of FeatureGFX12Insts)>;
AssemblerPredicate<(all_of FeatureGFX12Insts, (not FeatureGFX13Insts))>;
def isGFX12Not12_50 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX12 && !Subtarget->hasGFX1250Insts()">,
@ -2284,12 +2371,13 @@ def isGFX12Plus :
AssemblerPredicate<(all_of FeatureGFX12Insts)>;
def isGFX12PlusNot12_50 :
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12 && !Subtarget->hasGFX1250Insts()">,
AssemblerPredicate<(all_of FeatureGFX12Insts, (not FeatureGFX1250Insts))>;
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12 &&"
"(Subtarget->getGeneration() >= AMDGPUSubtarget::GFX13 || !Subtarget->hasGFX1250Insts())">,
AssemblerPredicate<(all_of FeatureGFX12Insts, (any_of FeatureGFX13Insts, (not FeatureGFX1250Insts)))>;
def isGFX125xOnly :
Predicate<"Subtarget->hasGFX1250Insts()">,
AssemblerPredicate<(all_of FeatureGFX1250Insts)>;
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX12 && Subtarget->hasGFX1250Insts()">,
AssemblerPredicate<(all_of FeatureGFX1250Insts, (not FeatureGFX13Insts))>;
def isGFX1250Plus :
Predicate<"Subtarget->hasGFX1250Insts()">,
@ -2300,8 +2388,18 @@ def isNotGFX1250Plus :
AssemblerPredicate<(all_of (not FeatureGFX1250Insts))>;
def isGFX940orGFX1250 :
Predicate<"Subtarget->hasGFX940Insts() || Subtarget->hasGFX1250Insts()">,
AssemblerPredicate<(any_of FeatureGFX940Insts, FeatureGFX1250Insts)>;
Predicate<"Subtarget->hasGFX940Insts() ||"
"(Subtarget->hasGFX1250Insts() && !Subtarget->hasGFX13Insts())">,
AssemblerPredicate<(any_of FeatureGFX940Insts,
(all_of FeatureGFX1250Insts, (not FeatureGFX13Insts)))>;
// Matches exactly the GFX13 generation. The codegen predicate compares the
// subtarget generation for equality; the assembler predicate requires
// FeatureGFX13Insts.
def isGFX13Only :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX13">,
AssemblerPredicate<(all_of FeatureGFX13Insts)>;
// Matches GFX13 and any later generation. The codegen predicate uses a >=
// comparison on the subtarget generation; the assembler predicate requires
// FeatureGFX13Insts.
def isGFX13Plus :
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX13">,
AssemblerPredicate<(all_of FeatureGFX13Insts)>;
def HasAtomicCondSubClampFlatInsts :
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12">,

View File

@ -660,7 +660,7 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
(void)PGRM_Rsrc3;
(void)EvaluatableRsrc3;
assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
STM.hasGFX90AInsts() || AMDGPU::isGFX1250(STM) || !EvaluatableRsrc3 ||
STM.hasGFX90AInsts() || STM.hasGFX1250Insts() || !EvaluatableRsrc3 ||
static_cast<uint64_t>(PGRM_Rsrc3) == 0);
KernelDescriptor.compute_pgm_rsrc3 = CurrentProgramInfo.ComputePGMRSrc3;
@ -831,7 +831,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
" AccumOffset: " + getMCExprStr(AdjustedAccum), false);
}
if (AMDGPU::isGFX1250(STM))
if (STM.hasGFX1250Insts())
OutStreamer->emitRawComment(
" NamedBarCnt: " + getMCExprStr(CurrentProgramInfo.NamedBarCnt),
false);
@ -867,7 +867,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
[[maybe_unused]] int64_t PGMRSrc3;
assert(STM.getGeneration() >= AMDGPUSubtarget::GFX10 ||
STM.hasGFX90AInsts() || AMDGPU::isGFX1250(STM) ||
STM.hasGFX90AInsts() || STM.hasGFX1250Insts() ||
(CurrentProgramInfo.ComputePGMRSrc3->evaluateAsAbsolute(PGMRSrc3) &&
static_cast<uint64_t>(PGMRSrc3) == 0));
if (STM.hasGFX90AInsts()) {
@ -1288,7 +1288,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
amdhsa::COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT_SHIFT);
}
if (AMDGPU::isGFX1250(STM))
if (STM.hasGFX1250Insts())
ProgInfo.ComputePGMRSrc3 =
SetBits(ProgInfo.ComputePGMRSrc3, ProgInfo.NamedBarCnt,
amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,

View File

@ -71,7 +71,7 @@ public:
if (SIInstrInfo::isTRANS(MI))
return TRANS;
// WMMA XDL ops are treated the same as TRANS.
if (AMDGPU::isGFX1250(*ST) && SII->isXDLWMMA(MI))
if (ST->hasGFX1250Insts() && SII->isXDLWMMA(MI))
return TRANS;
if (SIInstrInfo::isVALU(MI))
return VALU;

View File

@ -6153,7 +6153,7 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
// On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
// being added, so we can only safely match a 32-bit addition with no unsigned
// overflow.
bool CheckNUW = AMDGPU::isGFX1250(ST);
bool CheckNUW = ST.hasGFX1250Insts();
std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset(
MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW);

View File

@ -42,6 +42,7 @@ public:
GFX10 = 9,
GFX11 = 10,
GFX12 = 11,
GFX13 = 12,
};
private:

View File

@ -1551,6 +1551,12 @@ public:
bool isGFX1250() const { return AMDGPU::isGFX1250(getSTI()); }
bool isGFX1250Plus() const { return AMDGPU::isGFX1250Plus(getSTI()); }
// GFX13 subtarget queries: both forward to the matching AMDGPU:: helper,
// passing the subtarget info obtained from getSTI().
bool isGFX13() const { return AMDGPU::isGFX13(getSTI()); }
bool isGFX13Plus() const { return AMDGPU::isGFX13Plus(getSTI()); }
bool isGFX10_AEncoding() const { return AMDGPU::isGFX10_AEncoding(getSTI()); }
bool isGFX10_BEncoding() const {
@ -2931,7 +2937,7 @@ MCRegister AMDGPUAsmParser::getRegularReg(RegisterKind RegKind, unsigned RegNum,
return AMDGPU::NoRegister;
}
if (RegKind == IS_VGPR && !isGFX1250() && RegIdx + RegWidth / 32 > 256) {
if (RegKind == IS_VGPR && !isGFX1250Plus() && RegIdx + RegWidth / 32 > 256) {
Error(Loc, "register index is out of range");
return MCRegister();
}
@ -3953,7 +3959,7 @@ AMDGPUAsmParser::checkVOPDRegBankConstraints(const MCInst &Inst, bool AsVOPD3) {
bool SkipSrc = Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12 ||
Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx1250 ||
Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_e96_gfx1250;
bool AllowSameVGPR = isGFX1250();
bool AllowSameVGPR = isGFX1250Plus();
if (AsVOPD3) { // Literal constants are not allowed with VOPD3.
for (auto OpName : {OpName::src0X, OpName::src0Y}) {
@ -4087,7 +4093,7 @@ bool AMDGPUAsmParser::tryVOPD(const MCInst &Inst) {
// form but switch to VOPD3 otherwise.
bool AMDGPUAsmParser::tryAnotherVOPDEncoding(const MCInst &Inst) {
const unsigned Opcode = Inst.getOpcode();
if (!isGFX1250() || !isVOPD(Opcode))
if (!isGFX1250Plus() || !isVOPD(Opcode))
return false;
if (MII.get(Opcode).TSFlags & SIInstrFlags::VOPD3)
@ -5377,7 +5383,7 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
unsigned CPol = Inst.getOperand(CPolPos).getImm();
if (!isGFX1250()) {
if (!isGFX1250Plus()) {
if (CPol & CPol::SCAL) {
SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
StringRef CStr(S.getPointer());
@ -6176,7 +6182,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
return Error(IDRange.Start, "directive requires gfx90a+", IDRange);
AccumOffset = ExprVal;
} else if (ID == ".amdhsa_named_barrier_count") {
if (!isGFX1250())
if (!isGFX1250Plus())
return Error(IDRange.Start, "directive requires gfx1250+", IDRange);
NamedBarCnt = ExprVal;
} else if (ID == ".amdhsa_reserve_vcc") {
@ -6376,7 +6382,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
return TokError("amdgpu_user_sgpr_count smaller than than implied by "
"enabled user SGPRs");
if (isGFX1250()) {
if (isGFX1250Plus()) {
if (!isUInt<COMPUTE_PGM_RSRC2_GFX125_USER_SGPR_COUNT_WIDTH>(UserSGPRCount))
return TokError("too many user SGPRs enabled");
AMDGPU::MCKernelDescriptor::bits_set(
@ -6431,7 +6437,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
getContext());
}
if (isGFX1250())
if (isGFX1250Plus())
MCKernelDescriptor::bits_set(KD.compute_pgm_rsrc3, NamedBarCnt,
COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT,
COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,

View File

@ -580,7 +580,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// Try to decode DPP and SDWA first to solve conflict with VOP1 and VOP2
// encodings
if (isGFX1250() && Bytes.size() >= 16) {
if (isGFX1250Plus() && Bytes.size() >= 16) {
std::bitset<128> DecW = eat16Bytes(Bytes);
if (tryDecodeInst(DecoderTableGFX1250128, MI, DecW, Address, CS))
break;
@ -694,6 +694,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
tryDecodeInst(DecoderTableGFX12W6464, MI, QW, Address, CS))
break;
if (isGFX13() && tryDecodeInst(DecoderTableGFX1364, MI, QW, Address, CS))
break;
// Reinitialize Bytes
Bytes = Bytes_.slice(0, MaxInstBytesNum);
}
@ -2242,6 +2245,16 @@ bool AMDGPUDisassembler::isGFX12Plus() const {
bool AMDGPUDisassembler::isGFX1250() const { return AMDGPU::isGFX1250(STI); }
// Returns the result of AMDGPU::isGFX1250Plus for this disassembler's
// subtarget info (STI).
bool AMDGPUDisassembler::isGFX1250Plus() const {
return AMDGPU::isGFX1250Plus(STI);
}
// Returns the result of AMDGPU::isGFX13 for this disassembler's subtarget
// info (STI).
bool AMDGPUDisassembler::isGFX13() const { return AMDGPU::isGFX13(STI); }
// Returns the result of AMDGPU::isGFX13Plus for this disassembler's subtarget
// info (STI).
bool AMDGPUDisassembler::isGFX13Plus() const {
return AMDGPU::isGFX13Plus(STI);
}
bool AMDGPUDisassembler::hasArchitectedFlatScratch() const {
return STI.hasFeature(AMDGPU::FeatureArchitectedFlatScratch);
}
@ -2398,7 +2411,7 @@ Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1(
}
// Bits [27].
if (isGFX1250()) {
if (isGFX1250Plus()) {
PRINT_PSEUDO_DIRECTIVE_COMMENT("FLAT_SCRATCH_IS_NV",
COMPUTE_PGM_RSRC1_GFX125_FLAT_SCRATCH_IS_NV);
} else {
@ -2412,7 +2425,7 @@ Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1(
// Bits [29-31].
if (isGFX10Plus()) {
// WGP_MODE is not available on GFX1250.
if (!isGFX1250()) {
if (!isGFX1250Plus()) {
PRINT_DIRECTIVE(".amdhsa_workgroup_processor_mode",
COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE);
}
@ -2543,7 +2556,7 @@ Expected<bool> AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3(
}
// Bits [14-21].
if (isGFX1250()) {
if (isGFX1250Plus()) {
PRINT_DIRECTIVE(".amdhsa_named_barrier_count",
COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT);
PRINT_PSEUDO_DIRECTIVE_COMMENT(

View File

@ -182,6 +182,9 @@ public:
bool isGFX12() const;
bool isGFX12Plus() const;
bool isGFX1250() const;
bool isGFX1250Plus() const;
bool isGFX13() const;
bool isGFX13Plus() const;
bool hasArchitectedFlatScratch() const;
bool hasKernargPreload() const;

View File

@ -1932,7 +1932,7 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
}
bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) {
if (!AMDGPU::isGFX1250(ST) || // Coexecution disabled.
if (!ST.hasGFX1250Insts() || // Coexecution disabled.
!SIInstrInfo::isVALU(*MI) || SIInstrInfo::isTRANS(*MI))
return false;
@ -2077,7 +2077,7 @@ static bool IsWMMAHazardInstInCategory(const MachineInstr &MI,
}
int GCNHazardRecognizer::checkWMMACoexecutionHazards(MachineInstr *MI) {
if (!AMDGPU::isGFX1250(ST))
if (!ST.hasGFX1250Insts())
return 0;
const SIInstrInfo *TII = ST.getInstrInfo();

View File

@ -339,5 +339,5 @@ def : ProcessorModel<"gfx1251", GFX1250SpeedModel,
//===----------------------------------------------------------------------===//
def : ProcessorModel<"gfx1310", GFX12SpeedModel,
FeatureISAVersion12_50.Features
FeatureISAVersion13.Features
>;

View File

@ -401,7 +401,7 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
EmitMCExpr(KD.kernarg_size);
OS << '\n';
if (isGFX1250(STI)) {
if (isGFX1250Plus(STI)) {
PrintField(KD.compute_pgm_rsrc2,
amdhsa::COMPUTE_PGM_RSRC2_GFX125_USER_SGPR_COUNT_SHIFT,
amdhsa::COMPUTE_PGM_RSRC2_GFX125_USER_SGPR_COUNT,
@ -515,7 +515,7 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
OS << '\n';
}
if (AMDGPU::isGFX1250(STI))
if (isGFX1250Plus(STI))
PrintField(KD.compute_pgm_rsrc3,
amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT,
amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,

View File

@ -46,6 +46,7 @@ enum {
GFX11 = 10,
GFX12 = 11,
GFX1250 = 12,
GFX13 = 13,
};
}

View File

@ -11743,7 +11743,7 @@ SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
// On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
// being added, so we can only safely match a 32-bit addition with no
// unsigned overflow.
bool CheckNUW = AMDGPU::isGFX1250(*Subtarget);
bool CheckNUW = Subtarget->hasGFX1250Insts();
if (!CheckNUW || isNoUnsignedWrap(N0)) {
C1 = cast<ConstantSDNode>(N0.getOperand(1));
N0 = N0.getOperand(0);

View File

@ -10239,6 +10239,8 @@ static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
case AMDGPUSubtarget::GFX12:
return ST.hasGFX1250Insts() ? SIEncodingFamily::GFX1250
: SIEncodingFamily::GFX12;
case AMDGPUSubtarget::GFX13:
return SIEncodingFamily::GFX13;
}
llvm_unreachable("Unknown subtarget generation!");
}
@ -11260,7 +11262,7 @@ bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const {
if (!isWMMA(MI) && !isSWMMAC(MI))
return false;
if (AMDGPU::isGFX1250(ST))
if (ST.hasGFX1250Insts())
return AMDGPU::getWMMAIsXDL(MI.getOpcode());
return true;

View File

@ -27,6 +27,7 @@ def SIEncodingFamily {
int GFX11 = 10;
int GFX12 = 11;
int GFX1250 = 12;
int GFX13 = 13;
}
//===----------------------------------------------------------------------===//
@ -3365,7 +3366,8 @@ def getMCOpcodeGen : InstrMapping {
[!cast<string>(SIEncodingFamily.GFX940)],
[!cast<string>(SIEncodingFamily.GFX11)],
[!cast<string>(SIEncodingFamily.GFX12)],
[!cast<string>(SIEncodingFamily.GFX1250)]];
[!cast<string>(SIEncodingFamily.GFX1250)],
[!cast<string>(SIEncodingFamily.GFX13)]];
}
// Get equivalent SOPK instruction.

View File

@ -512,9 +512,9 @@ protected:
public:
SIGfx12CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {
// GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases
// the behavior is the same if assuming GFX12.0 in CU mode.
assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled());
// GFX120x and GFX125x memory models greatly overlap, and in some cases
// the behavior is the same if assuming GFX120x in CU mode.
assert(!ST.hasGFX1250Insts() || ST.hasGFX13Insts() || ST.isCuModeEnabled());
}
bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,

View File

@ -1464,7 +1464,7 @@ class SMEM_Real_gfx12Plus<bits<6> op, SM_Pseudo ps, string opName,
class SMEM_Real_gfx12<bits<6> op, SM_Pseudo ps, string opName = ps.Mnemonic> :
SMEM_Real_gfx12Plus<op, ps, opName, SIEncodingFamily.GFX12,
SGPR_NULL_gfx11plus> {
let AssemblerPredicate = isGFX12Plus;
let AssemblerPredicate = isGFX12Only;
let DecoderNamespace = "GFX12";
let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?);
@ -1537,3 +1537,84 @@ multiclass SMEM_Real_Probe_gfx12<bits<6> op> {
defm S_ATC_PROBE : SMEM_Real_Probe_gfx12<0x22>;
defm S_ATC_PROBE_BUFFER : SMEM_Real_Probe_gfx12<0x23>;
//===----------------------------------------------------------------------===//
// GFX13.
//===----------------------------------------------------------------------===//
// Base class for real (encoded) SMEM instructions on GFX13. It reuses the
// shared SMEM_Real_gfx12Plus layout with the GFX13 encoding family and
// SGPR_NULL_gfx11plus, is assembled only under isGFX13Plus, and is placed in
// the "GFX13" decoder namespace.
class SMEM_Real_gfx13<bits<6> op, SM_Pseudo ps, string opName = ps.Mnemonic> :
SMEM_Real_gfx12Plus<op, ps, opName, SIEncodingFamily.GFX13,
SGPR_NULL_gfx11plus> {
let AssemblerPredicate = isGFX13Plus;
let DecoderNamespace = "GFX13";
// Bits 6..1 of sbase are encoded (bit 0 is dropped); the field is left
// unset when the pseudo has no sbase operand.
let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?);
// Full 7-bit sdst, left unset when the pseudo has no sdst operand.
let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?);
}
// GFX13 real encoding for SMEM instructions that carry an sdata field in
// place of sdst. sdst is left unset, the low five bits of sdata are encoded in
// Inst{10-6}, and Inst{12-11} are forced to zero.
class SMEM_Real_Prefetch_gfx13<bits<6> op, SM_Pseudo ps> :
SMEM_Real_gfx13<op, ps> {
bits<7> sdata; // Only 5 bits of sdata are supported.
let sdst = ?;
let Inst{12-11} = 0; // Unused sdata bits.
// The sdata bits are emitted only when the pseudo reports has_sdst.
let Inst{10-6} = !if(ps.has_sdst, sdata{4-0}, ?);
}
// GFX13 real encoding for scalar loads.
//   op      - 6-bit opcode.
//   ps      - base name of the load pseudo; offsets.Variant is appended to
//             select the concrete pseudo.
//   opName  - mnemonic used for the real instruction.
//   offsets - offset addressing mode (supplies the Variant suffix and the
//             offset input operands).
// The input operand list is rebuilt as: sbase (with the pseudo's BaseClass),
// the offset operands, then cpol. Individual cpol bits are placed in the
// encoding as annotated below.
class SMEM_Real_Load_gfx13<bits<6> op, string ps, string opName, OffsetMode offsets> :
SMEM_Real_gfx13<op, !cast<SM_Pseudo>(ps # offsets.Variant), opName> {
RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass;
let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol));
let Inst{20} = cpol{CPolBit.NV}; // non-volatile
let Inst{22-21} = cpol{4-3}; // scope
let Inst{24-23} = cpol{1-0}; // th - only lower 2 bits are supported
let Inst{56} = cpol{CPolBit.SCAL}; // scale offset
}
// Defines the two GFX13 real variants of a scalar load: an immediate-offset
// form (_IMM_gfx13) and an SGPR-plus-optional-immediate form
// (_SGPR_IMM_gfx13). The mnemonic is the lower-cased defm name; `ps` names the
// pseudo and defaults to the defm name when the two coincide.
multiclass SM_Real_Loads_gfx13<bits<6> op, string ps = NAME> {
defvar opName = !tolower(NAME);
def _IMM_gfx13 : SMEM_Real_Load_gfx13<op, ps, opName, IMM_Offset>;
def _SGPR_IMM_gfx13 : SMEM_Real_Load_gfx13<op, ps, opName, SGPR_IMM_OptOffset>;
}
// GFX13 opcode assignments for scalar loads. The first argument is the 6-bit
// opcode. The second, where present, names the pseudo whose name differs from
// the GFX13 mnemonic (the *_DWORD* pseudos); when omitted the pseudo name
// equals the defm name.
defm S_LOAD_B32 : SM_Real_Loads_gfx13<0x00, "S_LOAD_DWORD">;
defm S_LOAD_B64 : SM_Real_Loads_gfx13<0x01, "S_LOAD_DWORDX2">;
defm S_LOAD_B96 : SM_Real_Loads_gfx13<0x0e, "S_LOAD_DWORDX3">;
defm S_LOAD_B128 : SM_Real_Loads_gfx13<0x02, "S_LOAD_DWORDX4">;
defm S_LOAD_B256 : SM_Real_Loads_gfx13<0x03, "S_LOAD_DWORDX8">;
defm S_LOAD_B512 : SM_Real_Loads_gfx13<0x04, "S_LOAD_DWORDX16">;
// Sub-dword scalar loads.
defm S_LOAD_I8 : SM_Real_Loads_gfx13<0x30>;
defm S_LOAD_U8 : SM_Real_Loads_gfx13<0x31>;
defm S_LOAD_I16 : SM_Real_Loads_gfx13<0x32>;
defm S_LOAD_U16 : SM_Real_Loads_gfx13<0x33>;
// Scalar buffer loads.
defm S_BUFFER_LOAD_B32 : SM_Real_Loads_gfx13<0x08, "S_BUFFER_LOAD_DWORD">;
defm S_BUFFER_LOAD_B64 : SM_Real_Loads_gfx13<0x09, "S_BUFFER_LOAD_DWORDX2">;
defm S_BUFFER_LOAD_B96 : SM_Real_Loads_gfx13<0x0d, "S_BUFFER_LOAD_DWORDX3">;
defm S_BUFFER_LOAD_B128 : SM_Real_Loads_gfx13<0x0a, "S_BUFFER_LOAD_DWORDX4">;
defm S_BUFFER_LOAD_B256 : SM_Real_Loads_gfx13<0x0b, "S_BUFFER_LOAD_DWORDX8">;
defm S_BUFFER_LOAD_B512 : SM_Real_Loads_gfx13<0x0c, "S_BUFFER_LOAD_DWORDX16">;
// Sub-dword scalar buffer loads.
defm S_BUFFER_LOAD_I8 : SM_Real_Loads_gfx13<0x34>;
defm S_BUFFER_LOAD_U8 : SM_Real_Loads_gfx13<0x35>;
defm S_BUFFER_LOAD_I16 : SM_Real_Loads_gfx13<0x36>;
defm S_BUFFER_LOAD_U16 : SM_Real_Loads_gfx13<0x37>;
// Non-load GFX13 SMEM encodings. Each def pairs a 6-bit opcode with its
// pseudo. S_DCACHE_INV uses the plain SMEM_Real_gfx13 layout; the prefetch
// instructions use the sdata layout of SMEM_Real_Prefetch_gfx13.
def S_DCACHE_INV_gfx13 : SMEM_Real_gfx13<0x020, S_DCACHE_INV>;
def S_PREFETCH_INST_gfx13 : SMEM_Real_Prefetch_gfx13<0x22, S_PREFETCH_INST>;
def S_PREFETCH_INST_PC_REL_gfx13 : SMEM_Real_Prefetch_gfx13<0x23, S_PREFETCH_INST_PC_REL>;
def S_PREFETCH_DATA_gfx13 : SMEM_Real_Prefetch_gfx13<0x2c, S_PREFETCH_DATA>;
def S_BUFFER_PREFETCH_DATA_gfx13 : SMEM_Real_Prefetch_gfx13<0x2d, S_BUFFER_PREFETCH_DATA>;
def S_PREFETCH_DATA_PC_REL_gfx13 : SMEM_Real_Prefetch_gfx13<0x2e, S_PREFETCH_DATA_PC_REL>;
// Defines the two GFX13 real variants of a probe instruction, both using the
// SMEM_Real_Prefetch_gfx13 layout. The pseudo is looked up from the defm name:
// <NAME>_IMM for the immediate form and <NAME>_SGPR_OPT_IMM for the
// SGPR-plus-optional-immediate form (note the real def is named
// _SGPR_IMM_gfx13 while the pseudo suffix is _SGPR_OPT_IMM).
multiclass SMEM_Real_Probe_gfx13<bits<6> op> {
defvar ps = NAME;
def _IMM_gfx13 : SMEM_Real_Prefetch_gfx13<op, !cast<SM_Probe_Pseudo>(ps#_IMM)>;
def _SGPR_IMM_gfx13 : SMEM_Real_Prefetch_gfx13<op, !cast<SM_Probe_Pseudo>(ps#_SGPR_OPT_IMM)>;
}
// GFX13 opcode assignments for the S_ATC_PROBE instructions.
defm S_ATC_PROBE : SMEM_Real_Probe_gfx13<0x26>;
defm S_ATC_PROBE_BUFFER : SMEM_Real_Probe_gfx13<0x27>;

View File

@ -99,7 +99,7 @@ static constexpr CustomOperand MsgOperands[] = {
{{"MSG_EARLY_PRIM_DEALLOC"}, ID_EARLY_PRIM_DEALLOC, isGFX9_GFX10},
{{"MSG_GS_ALLOC_REQ"}, ID_GS_ALLOC_REQ, isGFX9Plus},
{{"MSG_GET_DOORBELL"}, ID_GET_DOORBELL, isGFX9_GFX10},
{{"MSG_SAVEWAVE_HAS_TDM"}, ID_SAVEWAVE_HAS_TDM, isGFX1250},
{{"MSG_SAVEWAVE_HAS_TDM"}, ID_SAVEWAVE_HAS_TDM, isGFX1250Plus},
{{"MSG_GET_DDID"}, ID_GET_DDID, isGFX10},
{{"MSG_SYSMSG"}, ID_SYSMSG},
{{"MSG_RTN_GET_DOORBELL"}, ID_RTN_GET_DOORBELL, isGFX11Plus},
@ -111,7 +111,7 @@ static constexpr CustomOperand MsgOperands[] = {
{{"MSG_RTN_GET_TBA_TO_PC"}, ID_RTN_GET_TBA_TO_PC, isGFX11Plus},
{{"MSG_RTN_GET_SE_AID_ID"}, ID_RTN_GET_SE_AID_ID, isGFX12Plus},
{{"MSG_RTN_GET_CLUSTER_BARRIER_STATE"}, ID_RTN_GET_CLUSTER_BARRIER_STATE,
isGFX1250},
isGFX1250Plus},
};
static constexpr CustomOperand SysMsgOperands[] = {
@ -213,7 +213,7 @@ static constexpr CustomOperand Operands[] = {
{{"HW_REG_POPS_PACKER"}, ID_POPS_PACKER, isGFX10},
{{"HW_REG_WAVE_SCHED_MODE"}, ID_SCHED_MODE, isGFX12Plus},
{{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA_gfx11, isGFX11},
{{"HW_REG_IB_STS2"}, ID_IB_STS2, isGFX1250},
{{"HW_REG_IB_STS2"}, ID_IB_STS2, isGFX1250Plus},
{{"HW_REG_SHADER_CYCLES"}, ID_SHADER_CYCLES, isGFX10_3_GFX11},
{{"HW_REG_SHADER_CYCLES_LO"}, ID_SHADER_CYCLES, isGFX12Plus},
{{"HW_REG_SHADER_CYCLES_HI"}, ID_SHADER_CYCLES_HI, isGFX12Plus},
@ -221,8 +221,8 @@ static constexpr CustomOperand Operands[] = {
{{"HW_REG_DVGPR_ALLOC_LO"}, ID_DVGPR_ALLOC_LO, isGFX12Plus},
{{"HW_REG_WAVE_DVGPR_ALLOC_HI"}, ID_DVGPR_ALLOC_HI, isGFX12Plus},
{{"HW_REG_DVGPR_ALLOC_HI"}, ID_DVGPR_ALLOC_HI, isGFX12Plus},
{{"HW_REG_XNACK_STATE_PRIV"}, ID_XNACK_STATE_PRIV, isGFX1250},
{{"HW_REG_XNACK_MASK"}, ID_XNACK_MASK_gfx1250, isGFX1250},
{{"HW_REG_XNACK_STATE_PRIV"}, ID_XNACK_STATE_PRIV, isGFX1250Plus},
{{"HW_REG_XNACK_MASK"}, ID_XNACK_MASK_gfx1250, isGFX1250Plus},
};
// clang-format on

View File

@ -2514,7 +2514,7 @@ unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler) {
}
unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI) {
if (isGFX1250(STI))
if (isGFX1250Plus(STI))
return 32;
return 16;
}
@ -2581,14 +2581,26 @@ bool isGFX12(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX12];
}
bool isGFX12Plus(const MCSubtargetInfo &STI) { return isGFX12(STI); }
bool isGFX12Plus(const MCSubtargetInfo &STI) {
return isGFX12(STI) || isGFX13Plus(STI);
}
bool isNotGFX12Plus(const MCSubtargetInfo &STI) { return !isGFX12Plus(STI); }
// Returns true only for GFX1250-family subtargets proper: the GFX1250
// instruction feature must be set and the subtarget must not be GFX13 (which
// is excluded here even if it carries that feature).
bool isGFX1250(const MCSubtargetInfo &STI) {
  if (isGFX13(STI))
    return false;
  return STI.getFeatureBits()[AMDGPU::FeatureGFX1250Insts];
}
// Returns true whenever FeatureGFX1250Insts is set in the subtarget's feature
// bits. No generation check is applied, so unlike isGFX1250 this does not
// exclude GFX13.
bool isGFX1250Plus(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX1250Insts];
}
// Returns true when the GFX13 generation feature (FeatureGFX13) is set in the
// subtarget's feature bits.
bool isGFX13(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX13];
}
// Currently identical to isGFX13: it simply forwards to it.
bool isGFX13Plus(const MCSubtargetInfo &STI) { return isGFX13(STI); }
bool supportsWGP(const MCSubtargetInfo &STI) {
if (isGFX1250(STI))
return false;

View File

@ -1599,6 +1599,9 @@ bool isGFX11Plus(const MCSubtargetInfo &STI);
bool isGFX12(const MCSubtargetInfo &STI);
bool isGFX12Plus(const MCSubtargetInfo &STI);
bool isGFX1250(const MCSubtargetInfo &STI);
bool isGFX1250Plus(const MCSubtargetInfo &STI);
bool isGFX13(const MCSubtargetInfo &STI);
bool isGFX13Plus(const MCSubtargetInfo &STI);
bool supportsWGP(const MCSubtargetInfo &STI);
bool isNotGFX12Plus(const MCSubtargetInfo &STI);
bool isNotGFX11Plus(const MCSubtargetInfo &STI);

View File

@ -21,7 +21,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GCN,GFX1100,GFX1100W32 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX1100,GFX1100W64 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GFX1250 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck --check-prefixes=GCN,GFX1250 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck --check-prefixes=GCN,GFX1100,GFX1100W32 %s
; GCN-LABEL: {{^}}max_occupancy:
; GFX9: ; Occupancy: 10

File diff suppressed because it is too large Load Diff