llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Alexander Richardson 07e2ba445d
[AMDGPU] Set AS8 address width to 48 bits
Of the 128 bits of a buffer descriptor, only 48 are address bits, so
following the discussion on https://discourse.llvm.org/t/clarifiying-the-semantics-of-ptrtoint/83987/54,
the logical conclusion is to set the index width to 48 bits instead of
the current value of 128.

Most of the test changes are mechanical datalayout updates, but there
is one actual change: the ptrmask test now uses .i48 instead of .i128
and I had to update SelectionDAGBuilder to correctly extend the mask.

Reviewed By: krzysz00

Pull Request: https://github.com/llvm/llvm-project/pull/139419
2025-05-19 17:26:05 -07:00
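For illustration, the AS8 datalayout component is now "p8:128:128:128:48"
(size:ABI-align:preferred-align:index-width, in bits); previously the index
width defaulted to the full 128-bit pointer size. A ptrmask call on an AS8
pointer therefore now takes an i48 mask, along the lines of (sketch):

  %masked = call ptr addrspace(8) @llvm.ptrmask.p8.i48(ptr addrspace(8) %rsrc, i48 %mask)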


//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file contains both the AMDGPU target machine and the CodeGen pass
/// builder. The AMDGPU target machine contains all of the hardware-specific
/// information needed to emit code for SI+ GPUs in the legacy pass manager
/// pipeline. The CodeGen pass builder handles the pass pipeline for the new
/// pass manager.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUTargetMachine.h"
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCtorDtorLowering.h"
#include "AMDGPUExportClustering.h"
#include "AMDGPUExportKernelRuntimeHandles.h"
#include "AMDGPUIGroupLP.h"
#include "AMDGPUISelDAGToDAG.h"
#include "AMDGPUMacroFusion.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPUPreloadKernArgProlog.h"
#include "AMDGPURemoveIncompatibleFunctions.h"
#include "AMDGPUReserveWWMRegs.h"
#include "AMDGPUSplitModule.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUUnifyDivergentExitNodes.h"
#include "AMDGPUWaitSGPRHazards.h"
#include "GCNDPPCombine.h"
#include "GCNIterativeScheduler.h"
#include "GCNNSAReassign.h"
#include "GCNPreRALongBranchReg.h"
#include "GCNPreRAOptimizations.h"
#include "GCNRewritePartialRegUses.h"
#include "GCNSchedStrategy.h"
#include "GCNVOPDUtils.h"
#include "R600.h"
#include "R600TargetMachine.h"
#include "SIFixSGPRCopies.h"
#include "SIFixVGPRCopies.h"
#include "SIFoldOperands.h"
#include "SIFormMemoryClauses.h"
#include "SILoadStoreOptimizer.h"
#include "SILowerControlFlow.h"
#include "SILowerSGPRSpills.h"
#include "SILowerWWMCopies.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
#include "SIOptimizeExecMasking.h"
#include "SIOptimizeExecMaskingPreRA.h"
#include "SIOptimizeVGPRLiveRange.h"
#include "SIPeepholeSDWA.h"
#include "SIPostRABundler.h"
#include "SIPreAllocateWWMRegs.h"
#include "SIShrinkInstructions.h"
#include "SIWholeQuadMode.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/Analysis/KernelInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/AtomicExpand.h"
#include "llvm/CodeGen/BranchRelaxation.h"
#include "llvm/CodeGen/DeadMachineInstructionElim.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/MachineCSE.h"
#include "llvm/CodeGen/MachineLICM.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/PostRAHazardRecognizer.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Transforms/HipStdPar/HipStdPar.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/ExpandVariadics.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/EarlyCSE.h"
#include "llvm/Transforms/Scalar/FlattenCFG.h"
#include "llvm/Transforms/Scalar/GVN.h"
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
#include "llvm/Transforms/Scalar/LoopDataPrefetch.h"
#include "llvm/Transforms/Scalar/NaryReassociate.h"
#include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
#include "llvm/Transforms/Scalar/Sink.h"
#include "llvm/Transforms/Scalar/StraightLineStrengthReduce.h"
#include "llvm/Transforms/Scalar/StructurizeCFG.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/FixIrreducible.h"
#include "llvm/Transforms/Utils/LCSSA.h"
#include "llvm/Transforms/Utils/LowerSwitch.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
#include "llvm/Transforms/Utils/UnifyLoopExits.h"
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include <optional>
using namespace llvm;
using namespace llvm::PatternMatch;
namespace {
class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
public:
SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
: RegisterRegAllocBase(N, D, C) {}
};
class VGPRRegisterRegAlloc : public RegisterRegAllocBase<VGPRRegisterRegAlloc> {
public:
VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
: RegisterRegAllocBase(N, D, C) {}
};
class WWMRegisterRegAlloc : public RegisterRegAllocBase<WWMRegisterRegAlloc> {
public:
WWMRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)
: RegisterRegAllocBase(N, D, C) {}
};
static bool onlyAllocateSGPRs(const TargetRegisterInfo &TRI,
const MachineRegisterInfo &MRI,
const Register Reg) {
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
return static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
}
static bool onlyAllocateVGPRs(const TargetRegisterInfo &TRI,
const MachineRegisterInfo &MRI,
const Register Reg) {
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC);
}
static bool onlyAllocateWWMRegs(const TargetRegisterInfo &TRI,
const MachineRegisterInfo &MRI,
const Register Reg) {
const SIMachineFunctionInfo *MFI =
MRI.getMF().getInfo<SIMachineFunctionInfo>();
const TargetRegisterClass *RC = MRI.getRegClass(Reg);
return !static_cast<const SIRegisterInfo &>(TRI).isSGPRClass(RC) &&
MFI->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
}
/// -{sgpr|wwm|vgpr}-regalloc=... command line option.
static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
/// A dummy default pass factory indicates whether the register allocator is
/// overridden on the command line.
static llvm::once_flag InitializeDefaultSGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag;
static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag;
static SGPRRegisterRegAlloc
defaultSGPRRegAlloc("default",
"pick SGPR register allocator based on -O option",
useDefaultRegisterAllocator);
static cl::opt<SGPRRegisterRegAlloc::FunctionPassCtor, false,
RegisterPassParser<SGPRRegisterRegAlloc>>
SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
cl::desc("Register allocator to use for SGPRs"));
static cl::opt<VGPRRegisterRegAlloc::FunctionPassCtor, false,
RegisterPassParser<VGPRRegisterRegAlloc>>
VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator),
cl::desc("Register allocator to use for VGPRs"));
static cl::opt<WWMRegisterRegAlloc::FunctionPassCtor, false,
RegisterPassParser<WWMRegisterRegAlloc>>
WWMRegAlloc("wwm-regalloc", cl::Hidden,
cl::init(&useDefaultRegisterAllocator),
cl::desc("Register allocator to use for WWM registers"));
static void initializeDefaultSGPRRegisterAllocatorOnce() {
RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
if (!Ctor) {
Ctor = SGPRRegAlloc;
SGPRRegisterRegAlloc::setDefault(SGPRRegAlloc);
}
}
static void initializeDefaultVGPRRegisterAllocatorOnce() {
RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
if (!Ctor) {
Ctor = VGPRRegAlloc;
VGPRRegisterRegAlloc::setDefault(VGPRRegAlloc);
}
}
static void initializeDefaultWWMRegisterAllocatorOnce() {
RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
if (!Ctor) {
Ctor = WWMRegAlloc;
WWMRegisterRegAlloc::setDefault(WWMRegAlloc);
}
}
static FunctionPass *createBasicSGPRRegisterAllocator() {
return createBasicRegisterAllocator(onlyAllocateSGPRs);
}
static FunctionPass *createGreedySGPRRegisterAllocator() {
return createGreedyRegisterAllocator(onlyAllocateSGPRs);
}
static FunctionPass *createFastSGPRRegisterAllocator() {
return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}
static FunctionPass *createBasicVGPRRegisterAllocator() {
return createBasicRegisterAllocator(onlyAllocateVGPRs);
}
static FunctionPass *createGreedyVGPRRegisterAllocator() {
return createGreedyRegisterAllocator(onlyAllocateVGPRs);
}
static FunctionPass *createFastVGPRRegisterAllocator() {
return createFastRegisterAllocator(onlyAllocateVGPRs, true);
}
static FunctionPass *createBasicWWMRegisterAllocator() {
return createBasicRegisterAllocator(onlyAllocateWWMRegs);
}
static FunctionPass *createGreedyWWMRegisterAllocator() {
return createGreedyRegisterAllocator(onlyAllocateWWMRegs);
}
static FunctionPass *createFastWWMRegisterAllocator() {
return createFastRegisterAllocator(onlyAllocateWWMRegs, false);
}
static SGPRRegisterRegAlloc basicRegAllocSGPR(
"basic", "basic register allocator", createBasicSGPRRegisterAllocator);
static SGPRRegisterRegAlloc greedyRegAllocSGPR(
"greedy", "greedy register allocator", createGreedySGPRRegisterAllocator);
static SGPRRegisterRegAlloc fastRegAllocSGPR(
"fast", "fast register allocator", createFastSGPRRegisterAllocator);
static VGPRRegisterRegAlloc basicRegAllocVGPR(
"basic", "basic register allocator", createBasicVGPRRegisterAllocator);
static VGPRRegisterRegAlloc greedyRegAllocVGPR(
"greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator);
static VGPRRegisterRegAlloc fastRegAllocVGPR(
"fast", "fast register allocator", createFastVGPRRegisterAllocator);
static WWMRegisterRegAlloc basicRegAllocWWMReg("basic",
"basic register allocator",
createBasicWWMRegisterAllocator);
static WWMRegisterRegAlloc
greedyRegAllocWWMReg("greedy", "greedy register allocator",
createGreedyWWMRegisterAllocator);
static WWMRegisterRegAlloc fastRegAllocWWMReg("fast", "fast register allocator",
createFastWWMRegisterAllocator);
static bool isLTOPreLink(ThinOrFullLTOPhase Phase) {
return Phase == ThinOrFullLTOPhase::FullLTOPreLink ||
Phase == ThinOrFullLTOPhase::ThinLTOPreLink;
}
} // anonymous namespace
static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
cl::desc("Run early if-conversion"),
cl::init(false));
static cl::opt<bool>
OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
cl::desc("Run pre-RA exec mask optimizations"),
cl::init(true));
static cl::opt<bool>
LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
cl::desc("Lower GPU ctor / dtors to globals on the device."),
cl::init(true), cl::Hidden);
// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
"amdgpu-load-store-vectorizer",
cl::desc("Enable load store vectorizer"),
cl::init(true),
cl::Hidden);
// Option to control global loads scalarization
static cl::opt<bool> ScalarizeGlobal(
"amdgpu-scalarize-global-loads",
cl::desc("Enable global load scalarization"),
cl::init(true),
cl::Hidden);
// Option to run internalize pass.
static cl::opt<bool> InternalizeSymbols(
"amdgpu-internalize-symbols",
cl::desc("Enable elimination of non-kernel functions and unused globals"),
cl::init(false),
cl::Hidden);
// Option to inline all early.
static cl::opt<bool> EarlyInlineAll(
"amdgpu-early-inline-all",
cl::desc("Inline all functions early"),
cl::init(false),
cl::Hidden);
static cl::opt<bool> RemoveIncompatibleFunctions(
"amdgpu-enable-remove-incompatible-functions", cl::Hidden,
cl::desc("Enable removal of functions when they"
"use features not supported by the target GPU"),
cl::init(true));
static cl::opt<bool> EnableSDWAPeephole(
"amdgpu-sdwa-peephole",
cl::desc("Enable SDWA peepholer"),
cl::init(true));
static cl::opt<bool> EnableDPPCombine(
"amdgpu-dpp-combine",
cl::desc("Enable DPP combiner"),
cl::init(true));
// Enable address space based alias analysis
static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
cl::desc("Enable AMDGPU Alias Analysis"),
cl::init(true));
// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
"amdgpu-simplify-libcall",
cl::desc("Enable amdgpu library simplifications"),
cl::init(true),
cl::Hidden);
static cl::opt<bool> EnableLowerKernelArguments(
"amdgpu-ir-lower-kernel-arguments",
cl::desc("Lower kernel argument loads in IR pass"),
cl::init(true),
cl::Hidden);
static cl::opt<bool> EnableRegReassign(
"amdgpu-reassign-regs",
cl::desc("Enable register reassign optimizations on gfx10+"),
cl::init(true),
cl::Hidden);
static cl::opt<bool> OptVGPRLiveRange(
"amdgpu-opt-vgpr-liverange",
cl::desc("Enable VGPR liverange optimizations for if-else structure"),
cl::init(true), cl::Hidden);
static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
"amdgpu-atomic-optimizer-strategy",
cl::desc("Select DPP or Iterative strategy for scan"),
cl::init(ScanOptions::Iterative),
cl::values(
clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
clEnumValN(ScanOptions::Iterative, "Iterative",
"Use Iterative approach for scan"),
clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));
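// E.g. (illustrative): passing -amdgpu-atomic-optimizer-strategy=DPP to llc
// selects the DPP-based scan implementation.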
// Enable Mode register optimization
static cl::opt<bool> EnableSIModeRegisterPass(
"amdgpu-mode-register",
cl::desc("Enable mode register pass"),
cl::init(true),
cl::Hidden);
// Enable GFX11+ s_delay_alu insertion
static cl::opt<bool>
EnableInsertDelayAlu("amdgpu-enable-delay-alu",
cl::desc("Enable s_delay_alu insertion"),
cl::init(true), cl::Hidden);
// Enable GFX11+ VOPD
static cl::opt<bool>
EnableVOPD("amdgpu-enable-vopd",
cl::desc("Enable VOPD, dual issue of VALU in wave32"),
cl::init(true), cl::Hidden);
// This option is used in lit tests to prevent dead-coding of the patterns
// being inspected.
static cl::opt<bool>
EnableDCEInRA("amdgpu-dce-in-ra",
cl::init(true), cl::Hidden,
cl::desc("Enable machine DCE inside regalloc"));
static cl::opt<bool> EnableSetWavePriority("amdgpu-set-wave-priority",
cl::desc("Adjust wave priority"),
cl::init(false), cl::Hidden);
static cl::opt<bool> EnableScalarIRPasses(
"amdgpu-scalar-ir-passes",
cl::desc("Enable scalar IR passes"),
cl::init(true),
cl::Hidden);
static cl::opt<bool>
EnableSwLowerLDS("amdgpu-enable-sw-lower-lds",
cl::desc("Enable lowering of lds to global memory pass "
"and asan instrument resulting IR."),
cl::init(true), cl::Hidden);
static cl::opt<bool, true> EnableLowerModuleLDS(
"amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
cl::Hidden);
static cl::opt<bool> EnablePreRAOptimizations(
"amdgpu-enable-pre-ra-optimizations",
cl::desc("Enable Pre-RA optimizations pass"), cl::init(true),
cl::Hidden);
static cl::opt<bool> EnablePromoteKernelArguments(
"amdgpu-enable-promote-kernel-arguments",
cl::desc("Enable promotion of flat kernel pointer arguments to global"),
cl::Hidden, cl::init(true));
static cl::opt<bool> EnableImageIntrinsicOptimizer(
"amdgpu-enable-image-intrinsic-optimizer",
cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
cl::Hidden);
static cl::opt<bool>
EnableLoopPrefetch("amdgpu-loop-prefetch",
cl::desc("Enable loop data prefetch on AMDGPU"),
cl::Hidden, cl::init(false));
static cl::opt<std::string>
AMDGPUSchedStrategy("amdgpu-sched-strategy",
cl::desc("Select custom AMDGPU scheduling strategy."),
cl::Hidden, cl::init(""));
static cl::opt<bool> EnableRewritePartialRegUses(
"amdgpu-enable-rewrite-partial-reg-uses",
cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
cl::Hidden);
static cl::opt<bool> EnableHipStdPar(
"amdgpu-enable-hipstdpar",
cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(false),
cl::Hidden);
static cl::opt<bool>
EnableAMDGPUAttributor("amdgpu-attributor-enable",
cl::desc("Enable AMDGPUAttributorPass"),
cl::init(true), cl::Hidden);
static cl::opt<bool> NewRegBankSelect(
"new-reg-bank-select",
cl::desc("Run amdgpu-regbankselect and amdgpu-regbanklegalize instead of "
"regbankselect"),
cl::init(false), cl::Hidden);
static cl::opt<bool> HasClosedWorldAssumption(
"amdgpu-link-time-closed-world",
cl::desc("Whether has closed-world assumption at link time"),
cl::init(false), cl::Hidden);
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
PassRegistry *PR = PassRegistry::getPassRegistry();
initializeR600ClauseMergePassPass(*PR);
initializeR600ControlFlowFinalizerPass(*PR);
initializeR600PacketizerPass(*PR);
initializeR600ExpandSpecialInstrsPassPass(*PR);
initializeR600VectorRegMergerPass(*PR);
initializeR600EmitClauseMarkersPass(*PR);
initializeR600MachineCFGStructurizerPass(*PR);
initializeGlobalISel(*PR);
initializeAMDGPUAsmPrinterPass(*PR);
initializeAMDGPUDAGToDAGISelLegacyPass(*PR);
initializeGCNDPPCombineLegacyPass(*PR);
initializeSILowerI1CopiesLegacyPass(*PR);
initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
initializeAMDGPURegBankSelectPass(*PR);
initializeAMDGPURegBankLegalizePass(*PR);
initializeSILowerWWMCopiesLegacyPass(*PR);
initializeAMDGPUMarkLastScratchLoadLegacyPass(*PR);
initializeSILowerSGPRSpillsLegacyPass(*PR);
initializeSIFixSGPRCopiesLegacyPass(*PR);
initializeSIFixVGPRCopiesLegacyPass(*PR);
initializeSIFoldOperandsLegacyPass(*PR);
initializeSIPeepholeSDWALegacyPass(*PR);
initializeSIShrinkInstructionsLegacyPass(*PR);
initializeSIOptimizeExecMaskingPreRALegacyPass(*PR);
initializeSIOptimizeVGPRLiveRangeLegacyPass(*PR);
initializeSILoadStoreOptimizerLegacyPass(*PR);
initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
initializeAMDGPUAlwaysInlinePass(*PR);
initializeAMDGPUSwLowerLDSLegacyPass(*PR);
initializeAMDGPUAttributorLegacyPass(*PR);
initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
initializeAMDGPUArgumentUsageInfoPass(*PR);
initializeAMDGPUAtomicOptimizerPass(*PR);
initializeAMDGPULowerKernelArgumentsPass(*PR);
initializeAMDGPUPromoteKernelArgumentsPass(*PR);
initializeAMDGPULowerKernelAttributesPass(*PR);
initializeAMDGPUExportKernelRuntimeHandlesLegacyPass(*PR);
initializeAMDGPUPostLegalizerCombinerPass(*PR);
initializeAMDGPUPreLegalizerCombinerPass(*PR);
initializeAMDGPURegBankCombinerPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);
initializeAMDGPUPromoteAllocaToVectorPass(*PR);
initializeAMDGPUCodeGenPreparePass(*PR);
initializeAMDGPULateCodeGenPrepareLegacyPass(*PR);
initializeAMDGPURemoveIncompatibleFunctionsLegacyPass(*PR);
initializeAMDGPULowerModuleLDSLegacyPass(*PR);
initializeAMDGPULowerBufferFatPointersPass(*PR);
initializeAMDGPUReserveWWMRegsLegacyPass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowLegacyPass(*PR);
initializeAMDGPUInsertDelayAluLegacyPass(*PR);
initializeSIInsertHardClausesLegacyPass(*PR);
initializeSIInsertWaitcntsLegacyPass(*PR);
initializeSIModeRegisterLegacyPass(*PR);
initializeSIWholeQuadModeLegacyPass(*PR);
initializeSILowerControlFlowLegacyPass(*PR);
initializeSIPreEmitPeepholeLegacyPass(*PR);
initializeSILateBranchLoweringLegacyPass(*PR);
initializeSIMemoryLegalizerLegacyPass(*PR);
initializeSIOptimizeExecMaskingLegacyPass(*PR);
initializeSIPreAllocateWWMRegsLegacyPass(*PR);
initializeSIFormMemoryClausesLegacyPass(*PR);
initializeSIPostRABundlerLegacyPass(*PR);
initializeGCNCreateVOPDLegacyPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
initializeAMDGPUExternalAAWrapperPass(*PR);
initializeAMDGPUImageIntrinsicOptimizerPass(*PR);
initializeAMDGPUPrintfRuntimeBindingPass(*PR);
initializeAMDGPUResourceUsageAnalysisPass(*PR);
initializeGCNNSAReassignLegacyPass(*PR);
initializeGCNPreRAOptimizationsLegacyPass(*PR);
initializeGCNPreRALongBranchRegLegacyPass(*PR);
initializeGCNRewritePartialRegUsesLegacyPass(*PR);
initializeGCNRegPressurePrinterPass(*PR);
initializeAMDGPUPreloadKernArgPrologLegacyPass(*PR);
initializeAMDGPUWaitSGPRHazardsLegacyPass(*PR);
initializeAMDGPUPreloadKernelArgumentsLegacyPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
return std::make_unique<AMDGPUTargetObjectFile>();
}
static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
return new SIScheduleDAGMI(C);
}
static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
ScheduleDAGMILive *DAG =
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
return DAG;
}
static ScheduleDAGInstrs *
createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
ScheduleDAGMILive *DAG =
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
return DAG;
}
static ScheduleDAGInstrs *
createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
C, std::make_unique<GCNMaxMemoryClauseSchedStrategy>(C));
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
return DAG;
}
static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
auto *DAG = new GCNIterativeScheduler(
C, GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
return DAG;
}
static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
auto *DAG = new GCNIterativeScheduler(
C, GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
return DAG;
}
static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
auto *DAG = new GCNIterativeScheduler(C, GCNIterativeScheduler::SCHEDULE_ILP);
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::Initial));
return DAG;
}
static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
createSIMachineScheduler);
static MachineSchedRegistry
GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
"Run GCN scheduler to maximize occupancy",
createGCNMaxOccupancyMachineScheduler);
static MachineSchedRegistry
GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
createGCNMaxILPMachineScheduler);
static MachineSchedRegistry GCNMaxMemoryClauseSchedRegistry(
"gcn-max-memory-clause", "Run GCN scheduler to maximize memory clause",
createGCNMaxMemoryClauseMachineScheduler);
static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
"gcn-iterative-max-occupancy-experimental",
"Run GCN scheduler to maximize occupancy (experimental)",
createIterativeGCNMaxOccupancyMachineScheduler);
static MachineSchedRegistry GCNMinRegSchedRegistry(
"gcn-iterative-minreg",
"Run GCN iterative scheduler for minimal register usage (experimental)",
createMinRegScheduler);
static MachineSchedRegistry GCNILPSchedRegistry(
"gcn-iterative-ilp",
"Run GCN iterative scheduler for ILP scheduling (experimental)",
createIterativeILPMachineScheduler);
static StringRef computeDataLayout(const Triple &TT) {
if (TT.getArch() == Triple::r600) {
// 32-bit pointers.
return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
"-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1";
}
// 32-bit private, local, and region pointers. 64-bit global, constant and
// flat. 160-bit non-integral fat buffer pointers that include a 128-bit
// buffer descriptor and a 32-bit offset, which are indexed by 32-bit values
// (address space 7), and 128-bit non-integral buffer resources (address
// space 8) which cannot be non-trivially accessed by LLVM memory operations
// like getelementptr.
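// In a "pN:<size>:<abi>[:<pref>[:<idx>]]" datalayout component the final
// field is the index width, so "p8:128:128:128:48" declares 128-bit AS8
// pointers of which only 48 bits are address (index) bits.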
return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
"-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-"
"v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-"
"v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9";
}
LLVM_READNONE
static StringRef getGPUOrDefault(const Triple &TT, StringRef GPU) {
if (!GPU.empty())
return GPU;
// Need to default to a target with flat support for HSA.
if (TT.isAMDGCN())
return TT.getOS() == Triple::AMDHSA ? "generic-hsa" : "generic";
return "r600";
}
static Reloc::Model getEffectiveRelocModel(std::optional<Reloc::Model> RM) {
// The AMDGPU toolchain only supports generating shared objects, so we
// must always use PIC.
return Reloc::PIC_;
}
AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
std::optional<Reloc::Model> RM,
std::optional<CodeModel::Model> CM,
CodeGenOptLevel OptLevel)
: CodeGenTargetMachineImpl(
T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU), FS, Options,
getEffectiveRelocModel(RM),
getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
TLOF(createTLOF(getTargetTriple())) {
initAsmInfo();
if (TT.isAMDGCN()) {
if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
}
}
bool AMDGPUTargetMachine::EnableFunctionCalls = false;
bool AMDGPUTargetMachine::EnableLowerModuleLDS = true;
AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
StringRef AMDGPUTargetMachine::getGPUName(const Function &F) const {
Attribute GPUAttr = F.getFnAttribute("target-cpu");
return GPUAttr.isValid() ? GPUAttr.getValueAsString() : getTargetCPU();
}
StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
Attribute FSAttr = F.getFnAttribute("target-features");
return FSAttr.isValid() ? FSAttr.getValueAsString()
: getTargetFeatureString();
}
llvm::ScheduleDAGInstrs *
AMDGPUTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
ScheduleDAGMILive *DAG = createGenericSchedLive(C);
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
return DAG;
}
/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
if (const Function *F = dyn_cast<Function>(&GV))
return F->isDeclaration() || F->getName().starts_with("__asan_") ||
F->getName().starts_with("__sanitizer_") ||
AMDGPU::isEntryFunctionCC(F->getCallingConv());
GV.removeDeadConstantUsers();
return !GV.use_empty();
}
void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
AAM.registerFunctionAnalysis<AMDGPUAA>();
}
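/// Parse the optional strategy parameter of the atomic optimizer, as used in
/// new pass manager pipelines, e.g. (sketch)
/// -passes='amdgpu-atomic-optimizer<strategy=dpp>'.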
static Expected<ScanOptions>
parseAMDGPUAtomicOptimizerStrategy(StringRef Params) {
if (Params.empty())
return ScanOptions::Iterative;
Params.consume_front("strategy=");
auto Result = StringSwitch<std::optional<ScanOptions>>(Params)
.Case("dpp", ScanOptions::DPP)
.Cases("iterative", "", ScanOptions::Iterative)
.Case("none", ScanOptions::None)
.Default(std::nullopt);
if (Result)
return *Result;
return make_error<StringError>("invalid parameter", inconvertibleErrorCode());
}
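/// Parse AMDGPUAttributor pass parameters, e.g. (sketch)
/// -passes='amdgpu-attributor<closed-world>'.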
Expected<AMDGPUAttributorOptions>
parseAMDGPUAttributorPassOptions(StringRef Params) {
AMDGPUAttributorOptions Result;
while (!Params.empty()) {
StringRef ParamName;
std::tie(ParamName, Params) = Params.split(';');
if (ParamName == "closed-world") {
Result.IsClosedWorld = true;
} else {
return make_error<StringError>(
formatv("invalid AMDGPUAttributor pass parameter '{0}' ", ParamName)
.str(),
inconvertibleErrorCode());
}
}
return Result;
}
void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
#define GET_PASS_REGISTRY "AMDGPUPassRegistry.def"
#include "llvm/Passes/TargetPassRegistry.inc"
PB.registerPipelineEarlySimplificationEPCallback(
[](ModulePassManager &PM, OptimizationLevel Level,
ThinOrFullLTOPhase Phase) {
if (!isLTOPreLink(Phase)) {
// When we are not using -fgpu-rdc, we can run accelerator code
// selection relatively early, but still after linking to prevent
// eager removal of potentially reachable symbols.
if (EnableHipStdPar)
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
PM.addPass(AMDGPUPrintfRuntimeBindingPass());
}
if (Level == OptimizationLevel::O0)
return;
PM.addPass(AMDGPUUnifyMetadataPass());
// We don't want to run internalization at the per-module stage.
if (InternalizeSymbols && !isLTOPreLink(Phase)) {
PM.addPass(InternalizePass(mustPreserveGV));
PM.addPass(GlobalDCEPass());
}
if (EarlyInlineAll && !EnableFunctionCalls)
PM.addPass(AMDGPUAlwaysInlinePass());
});
PB.registerPeepholeEPCallback(
[](FunctionPassManager &FPM, OptimizationLevel Level) {
if (Level == OptimizationLevel::O0)
return;
FPM.addPass(AMDGPUUseNativeCallsPass());
if (EnableLibCallSimplify)
FPM.addPass(AMDGPUSimplifyLibCallsPass());
});
PB.registerCGSCCOptimizerLateEPCallback(
[this](CGSCCPassManager &PM, OptimizationLevel Level) {
if (Level == OptimizationLevel::O0)
return;
FunctionPassManager FPM;
// Add promote kernel arguments pass to the opt pipeline right before
// infer address spaces which is needed to do actual address space
// rewriting.
if (Level.getSpeedupLevel() > OptimizationLevel::O1.getSpeedupLevel() &&
EnablePromoteKernelArguments)
FPM.addPass(AMDGPUPromoteKernelArgumentsPass());
// Add infer address spaces pass to the opt pipeline after inlining
// but before SROA to increase SROA opportunities.
FPM.addPass(InferAddressSpacesPass());
// This should run after inlining to have any chance of doing
// anything, and before other cleanup optimizations.
FPM.addPass(AMDGPULowerKernelAttributesPass());
if (Level != OptimizationLevel::O0) {
// Promote alloca to vector before SROA and loop unroll. If we
// manage to eliminate allocas before unroll we may choose to unroll
// less.
FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this));
}
PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
});
// FIXME: Why is AMDGPUAttributor not in CGSCC?
PB.registerOptimizerLastEPCallback([this](ModulePassManager &MPM,
OptimizationLevel Level,
ThinOrFullLTOPhase Phase) {
if (Level != OptimizationLevel::O0) {
if (!isLTOPreLink(Phase)) {
AMDGPUAttributorOptions Opts;
MPM.addPass(AMDGPUAttributorPass(*this, Opts, Phase));
}
}
});
PB.registerFullLinkTimeOptimizationLastEPCallback(
[this](ModulePassManager &PM, OptimizationLevel Level) {
// When we are using -fgpu-rdc, we can only run accelerator code
// selection after linking; otherwise we would end up removing
// potentially reachable symbols that were exported as external in other
// modules.
if (EnableHipStdPar)
PM.addPass(HipStdParAcceleratorCodeSelectionPass());
// We want to support the -lto-partitions=N option as "best effort".
// For that, we need to lower LDS earlier in the pipeline before the
// module is partitioned for codegen.
if (EnableSwLowerLDS)
PM.addPass(AMDGPUSwLowerLDSPass(*this));
if (EnableLowerModuleLDS)
PM.addPass(AMDGPULowerModuleLDSPass(*this));
if (Level != OptimizationLevel::O0) {
// Do we really need internalization in LTO?
if (InternalizeSymbols) {
PM.addPass(InternalizePass(mustPreserveGV));
PM.addPass(GlobalDCEPass());
}
if (EnableAMDGPUAttributor) {
AMDGPUAttributorOptions Opt;
if (HasClosedWorldAssumption)
Opt.IsClosedWorld = true;
PM.addPass(AMDGPUAttributorPass(
*this, Opt, ThinOrFullLTOPhase::FullLTOPostLink));
}
}
if (!NoKernelInfoEndLTO) {
FunctionPassManager FPM;
FPM.addPass(KernelInfoPrinter(this));
PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
}
});
PB.registerRegClassFilterParsingCallback(
[](StringRef FilterName) -> RegAllocFilterFunc {
if (FilterName == "sgpr")
return onlyAllocateSGPRs;
if (FilterName == "vgpr")
return onlyAllocateVGPRs;
if (FilterName == "wwm")
return onlyAllocateWWMRegs;
return nullptr;
});
}
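// Address 0 is a valid, addressable location in the local, private, and
// region address spaces, so the all-ones value serves as the null pointer
// there (e.g., a null ptr addrspace(3) lowers to the 32-bit constant -1);
// all other address spaces use 0.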
int64_t AMDGPUTargetMachine::getNullPointerValue(unsigned AddrSpace) {
return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
AddrSpace == AMDGPUAS::REGION_ADDRESS)
? -1
: 0;
}
bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
return AMDGPU::isFlatGlobalAddrSpace(SrcAS) &&
AMDGPU::isFlatGlobalAddrSpace(DestAS);
}
unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
const auto *LD = dyn_cast<LoadInst>(V);
if (!LD) // TODO: Handle invariant load like constant.
return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
// It must be a load of a generic pointer.
assert(V->getType()->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS);
const auto *Ptr = LD->getPointerOperand();
if (Ptr->getType()->getPointerAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
// For a generic pointer loaded from the constant memory, it could be assumed
// as a global pointer since the constant memory is only populated on the
// host side. As implied by the offload programming model, only global
// pointers could be referenced on the host side.
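// For example (illustrative IR), in
//   %p = load ptr, ptr addrspace(4) %gep
// the loaded generic pointer %p can be assumed to point to global memory.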
return AMDGPUAS::GLOBAL_ADDRESS;
}
std::pair<const Value *, unsigned>
AMDGPUTargetMachine::getPredicatedAddrSpace(const Value *V) const {
if (auto *II = dyn_cast<IntrinsicInst>(V)) {
switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_is_shared:
return std::pair(II->getArgOperand(0), AMDGPUAS::LOCAL_ADDRESS);
case Intrinsic::amdgcn_is_private:
return std::pair(II->getArgOperand(0), AMDGPUAS::PRIVATE_ADDRESS);
default:
break;
}
return std::pair(nullptr, -1);
}
// Check the global pointer predication based on
// (!is_share(p) && !is_private(p)). Note that logic 'and' is commutative and
// the order of 'is_shared' and 'is_private' is not significant.
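// For example (illustrative IR), the following sequence is matched:
//   %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %p)
//   %not.shared = xor i1 %is.shared, true
//   %is.private = call i1 @llvm.amdgcn.is.private(ptr %p)
//   %not.private = xor i1 %is.private, true
//   %is.global = and i1 %not.shared, %not.private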
Value *Ptr;
if (match(
const_cast<Value *>(V),
m_c_And(m_Not(m_Intrinsic<Intrinsic::amdgcn_is_shared>(m_Value(Ptr))),
m_Not(m_Intrinsic<Intrinsic::amdgcn_is_private>(
m_Deferred(Ptr))))))
return std::pair(Ptr, AMDGPUAS::GLOBAL_ADDRESS);
return std::pair(nullptr, -1);
}
unsigned
AMDGPUTargetMachine::getAddressSpaceForPseudoSourceKind(unsigned Kind) const {
switch (Kind) {
case PseudoSourceValue::Stack:
case PseudoSourceValue::FixedStack:
return AMDGPUAS::PRIVATE_ADDRESS;
case PseudoSourceValue::ConstantPool:
case PseudoSourceValue::GOT:
case PseudoSourceValue::JumpTable:
case PseudoSourceValue::GlobalValueCallEntry:
case PseudoSourceValue::ExternalSymbolCallEntry:
return AMDGPUAS::CONSTANT_ADDRESS;
}
return AMDGPUAS::FLAT_ADDRESS;
}
bool AMDGPUTargetMachine::splitModule(
Module &M, unsigned NumParts,
function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback) {
// FIXME(?): Would be better to use an already existing Analysis/PassManager,
// but all current users of this API don't have one ready and would need to
// create one anyway. Let's hide the boilerplate for now to keep it simple.
LoopAnalysisManager LAM;
FunctionAnalysisManager FAM;
CGSCCAnalysisManager CGAM;
ModuleAnalysisManager MAM;
PassBuilder PB(this);
PB.registerModuleAnalyses(MAM);
PB.registerFunctionAnalyses(FAM);
PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
ModulePassManager MPM;
MPM.addPass(AMDGPUSplitModulePass(NumParts, ModuleCallback));
MPM.run(M, MAM);
return true;
}
//===----------------------------------------------------------------------===//
// GCN Target Machine (SI+)
//===----------------------------------------------------------------------===//
GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
std::optional<Reloc::Model> RM,
std::optional<CodeModel::Model> CM,
CodeGenOptLevel OL, bool JIT)
: AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
const TargetSubtargetInfo *
GCNTargetMachine::getSubtargetImpl(const Function &F) const {
StringRef GPU = getGPUName(F);
StringRef FS = getFeatureString(F);
SmallString<128> SubtargetKey(GPU);
SubtargetKey.append(FS);
auto &I = SubtargetMap[SubtargetKey];
if (!I) {
// This needs to be done before we create a new subtarget since any
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
}
I->setScalarizeGlobalBehavior(ScalarizeGlobal);
return I.get();
}
TargetTransformInfo
GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
return TargetTransformInfo(std::make_unique<GCNTTIImpl>(this, F));
}
Error GCNTargetMachine::buildCodeGenPipeline(
ModulePassManager &MPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
CodeGenFileType FileType, const CGPassBuilderOption &Opts,
PassInstrumentationCallbacks *PIC) {
AMDGPUCodeGenPassBuilder CGPB(*this, Opts, PIC);
return CGPB.buildPipeline(MPM, Out, DwoOut, FileType);
}
ScheduleDAGInstrs *
GCNTargetMachine::createMachineScheduler(MachineSchedContext *C) const {
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
if (ST.enableSIScheduler())
return createSIMachineScheduler(C);
Attribute SchedStrategyAttr =
C->MF->getFunction().getFnAttribute("amdgpu-sched-strategy");
StringRef SchedStrategy = SchedStrategyAttr.isValid()
? SchedStrategyAttr.getValueAsString()
: AMDGPUSchedStrategy;
if (SchedStrategy == "max-ilp")
return createGCNMaxILPMachineScheduler(C);
if (SchedStrategy == "max-memory-clause")
return createGCNMaxMemoryClauseMachineScheduler(C);
if (SchedStrategy == "iterative-ilp")
return createIterativeILPMachineScheduler(C);
if (SchedStrategy == "iterative-minreg")
return createMinRegScheduler(C);
if (SchedStrategy == "iterative-maxocc")
return createIterativeGCNMaxOccupancyMachineScheduler(C);
return createGCNMaxOccupancyMachineScheduler(C);
}
ScheduleDAGInstrs *
GCNTargetMachine::createPostMachineScheduler(MachineSchedContext *C) const {
ScheduleDAGMI *DAG =
new GCNPostScheduleDAGMILive(C, std::make_unique<PostGenericScheduler>(C),
/*RemoveKillFlags=*/true);
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createIGroupLPDAGMutation(AMDGPU::SchedulingPhase::PostRA));
if ((EnableVOPD.getNumOccurrences() ||
getOptLevel() >= CodeGenOptLevel::Less) &&
EnableVOPD)
DAG->addMutation(createVOPDPairingMutation());
return DAG;
}
//===----------------------------------------------------------------------===//
// AMDGPU Legacy Pass Setup
//===----------------------------------------------------------------------===//
std::unique_ptr<CSEConfigBase> llvm::AMDGPUPassConfig::getCSEConfig() const {
return getStandardCSEConfigForOpt(TM->getOptLevel());
}
namespace {
class GCNPassConfig final : public AMDGPUPassConfig {
public:
GCNPassConfig(TargetMachine &TM, PassManagerBase &PM)
: AMDGPUPassConfig(TM, PM) {
// It is necessary to know the register usage of the entire call graph. We
// allow calls without EnableAMDGPUFunctionCalls if they are marked
// noinline, so this is always required.
setRequiresCodeGenSCCOrder(true);
substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
}
GCNTargetMachine &getGCNTargetMachine() const {
return getTM<GCNTargetMachine>();
}
bool addPreISel() override;
void addMachineSSAOptimization() override;
bool addILPOpts() override;
bool addInstSelector() override;
bool addIRTranslator() override;
void addPreLegalizeMachineIR() override;
bool addLegalizeMachineIR() override;
void addPreRegBankSelect() override;
bool addRegBankSelect() override;
void addPreGlobalInstructionSelect() override;
bool addGlobalInstructionSelect() override;
void addFastRegAlloc() override;
void addOptimizedRegAlloc() override;
FunctionPass *createSGPRAllocPass(bool Optimized);
FunctionPass *createVGPRAllocPass(bool Optimized);
FunctionPass *createWWMRegAllocPass(bool Optimized);
FunctionPass *createRegAllocPass(bool Optimized) override;
bool addRegAssignAndRewriteFast() override;
bool addRegAssignAndRewriteOptimized() override;
bool addPreRewrite() override;
void addPostRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
void addPostBBSections() override;
};
} // end anonymous namespace
AMDGPUPassConfig::AMDGPUPassConfig(TargetMachine &TM, PassManagerBase &PM)
: TargetPassConfig(TM, PM) {
// Exceptions and StackMaps are not supported, so these passes will never do
// anything.
disablePass(&StackMapLivenessID);
disablePass(&FuncletLayoutID);
// Garbage collection is not supported.
disablePass(&GCLoweringID);
disablePass(&ShadowStackGCLoweringID);
}
void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
if (getOptLevel() == CodeGenOptLevel::Aggressive)
addPass(createGVNPass());
else
addPass(createEarlyCSEPass());
}
void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
addPass(createLoopDataPrefetchPass());
addPass(createSeparateConstOffsetFromGEPPass());
// ReassociateGEPs exposes more opportunities for SLSR. See
// the example in reassociate-geps-and-slsr.ll.
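// As an illustration, SLSR rewrites related address computations such as
//   %a = getelementptr i32, ptr %base, i64 %i
//   %b = getelementptr i32, ptr %base, i64 %i1   ; where %i1 = add i64 %i, 1
// so that %b is computed from %a with a constant byte offset.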
addPass(createStraightLineStrengthReducePass());
// SeparateConstOffsetFromGEP and SLSR create common expressions which GVN or
// EarlyCSE can reuse.
addEarlyCSEOrGVNPass();
// Run NaryReassociate after EarlyCSE/GVN to be more effective.
addPass(createNaryReassociatePass());
// NaryReassociate on GEPs creates redundant common expressions, so run
// EarlyCSE after it.
addPass(createEarlyCSEPass());
}
void AMDGPUPassConfig::addIRPasses() {
const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN())
addPass(createAMDGPURemoveIncompatibleFunctionsPass(&TM));
// There is no reason to run these.
disablePass(&StackMapLivenessID);
disablePass(&FuncletLayoutID);
disablePass(&PatchableFunctionID);
addPass(createAMDGPUPrintfRuntimeBinding());
if (LowerCtorDtor)
addPass(createAMDGPUCtorDtorLoweringLegacyPass());
if (isPassEnabled(EnableImageIntrinsicOptimizer))
addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
// This can be disabled by passing ::Disable here or on the command line
// with --expand-variadics-override=disable.
addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
// Function calls are not supported, so make sure we inline everything.
addPass(createAMDGPUAlwaysInlinePass());
addPass(createAlwaysInlinerLegacyPass());
// Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
if (TM.getTargetTriple().getArch() == Triple::r600)
addPass(createR600OpenCLImageTypeLoweringPass());
// Make enqueued block runtime handles externally visible.
addPass(createAMDGPUExportKernelRuntimeHandlesLegacyPass());
// Lower LDS accesses to global memory pass if address sanitizer is enabled.
if (EnableSwLowerLDS)
addPass(createAMDGPUSwLowerLDSLegacyPass(&TM));
// Runs before PromoteAlloca so the latter can account for function uses.
if (EnableLowerModuleLDS) {
addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
}
if (TM.getOptLevel() > CodeGenOptLevel::None)
addPass(createInferAddressSpacesPass());
// Run atomic optimizer before Atomic Expand
if ((TM.getTargetTriple().isAMDGCN()) &&
(TM.getOptLevel() >= CodeGenOptLevel::Less) &&
(AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
}
addPass(createAtomicExpandLegacyPass());
if (TM.getOptLevel() > CodeGenOptLevel::None) {
addPass(createAMDGPUPromoteAlloca());
if (isPassEnabled(EnableScalarIRPasses))
addStraightLineScalarOptimizationPasses();
if (EnableAMDGPUAliasAnalysis) {
addPass(createAMDGPUAAWrapperPass());
addPass(createExternalAAWrapperPass([](Pass &P, Function &,
AAResults &AAR) {
if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
AAR.addAAResult(WrapperPass->getResult());
}));
}
if (TM.getTargetTriple().isAMDGCN()) {
// TODO: May want to move later or split into an early and late one.
addPass(createAMDGPUCodeGenPreparePass());
}
// Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
// have expanded.
if (TM.getOptLevel() > CodeGenOptLevel::Less)
addPass(createLICMPass());
}
TargetPassConfig::addIRPasses();
// EarlyCSE is not always strong enough to clean up what LSR produces. For
// example, GVN can combine
//
// %0 = add %a, %b
// %1 = add %b, %a
//
// and
//
// %0 = shl nsw %a, 2
// %1 = shl %a, 2
//
// but EarlyCSE can do neither of them.
if (isPassEnabled(EnableScalarIRPasses))
addEarlyCSEOrGVNPass();
}
void AMDGPUPassConfig::addCodeGenPrepare() {
if (TM->getTargetTriple().isAMDGCN() &&
TM->getOptLevel() > CodeGenOptLevel::None)
addPass(createAMDGPUPreloadKernelArgumentsLegacyPass(TM));
if (TM->getTargetTriple().isAMDGCN() && EnableLowerKernelArguments)
addPass(createAMDGPULowerKernelArgumentsPass());
if (TM->getTargetTriple().isAMDGCN()) {
// This lowering has been placed after codegenprepare to take advantage of
// address mode matching (which is why it isn't put with the LDS lowerings).
// It could be placed anywhere before uniformity annotations (an analysis
// that it changes by splitting up fat pointers into their components)
// but has been put before switch lowering and CFG flattening so that those
// passes can run on the more optimized control flow this pass creates in
// many cases.
//
// FIXME: This should ideally be put after the LoadStoreVectorizer.
// However, due to some annoying facts about ResourceUsageAnalysis,
// (especially as exercised in the resource-usage-dead-function test),
// we need all the function passes from codegenprepare all the way through
// said resource usage analysis to run on the call graph produced
// before codegenprepare runs (because codegenprepare will knock some
// nodes out of the graph, which leads to function-level passes not
// being run on them, which causes crashes in the resource usage analysis).
addPass(createAMDGPULowerBufferFatPointersPass());
// In accordance with the above FIXME, manually force all the
// function-level passes into a CGSCCPassManager.
addPass(new DummyCGSCCPass());
}
TargetPassConfig::addCodeGenPrepare();
if (isPassEnabled(EnableLoadStoreVectorizer))
addPass(createLoadStoreVectorizerPass());
// The LowerSwitch pass may introduce unreachable blocks that can cause
// unexpected behavior for subsequent passes. Placing it here seems better,
// as these blocks will get cleaned up by UnreachableBlockElim, which is
// inserted next in the pass flow.
addPass(createLowerSwitchPass());
}
bool AMDGPUPassConfig::addPreISel() {
if (TM->getOptLevel() > CodeGenOptLevel::None)
addPass(createFlattenCFGPass());
return false;
}
bool AMDGPUPassConfig::addInstSelector() {
addPass(createAMDGPUISelDag(getAMDGPUTargetMachine(), getOptLevel()));
return false;
}
bool AMDGPUPassConfig::addGCPasses() {
// Do nothing. GC is not supported.
return false;
}
//===----------------------------------------------------------------------===//
// GCN Legacy Pass Setup
//===----------------------------------------------------------------------===//
bool GCNPassConfig::addPreISel() {
AMDGPUPassConfig::addPreISel();
if (TM->getOptLevel() > CodeGenOptLevel::None)
addPass(createSinkingPass());
if (TM->getOptLevel() > CodeGenOptLevel::None)
addPass(createAMDGPULateCodeGenPrepareLegacyPass());
// Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
// regions formed by them.
addPass(&AMDGPUUnifyDivergentExitNodesID);
addPass(createFixIrreduciblePass());
addPass(createUnifyLoopExitsPass());
addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
addPass(createAMDGPUAnnotateUniformValuesLegacy());
addPass(createSIAnnotateControlFlowLegacyPass());
// TODO: Move this right after structurizeCFG to avoid extra divergence
// analysis. This depends on stopping SIAnnotateControlFlow from making
// control flow modifications.
addPass(createAMDGPURewriteUndefForPHILegacyPass());
// SDAG requires LCSSA, GlobalISel does not. Disable LCSSA for -global-isel
// with -new-reg-bank-select and without any of the fallback options.
if (!getCGPassBuilderOption().EnableGlobalISelOption ||
!isGlobalISelAbortEnabled() || !NewRegBankSelect)
addPass(createLCSSAPass());
if (TM->getOptLevel() > CodeGenOptLevel::Less)
addPass(&AMDGPUPerfHintAnalysisLegacyID);
return false;
}
void GCNPassConfig::addMachineSSAOptimization() {
TargetPassConfig::addMachineSSAOptimization();
// We want to fold operands after PeepholeOptimizer has run (or as part of
// it), because it will eliminate extra copies making it easier to fold the
// real source operand. We want to eliminate dead instructions after, so that
// we see fewer uses of the copies. We then need to clean up the dead
// instructions leftover after the operands are folded as well.
//
// XXX - Can we get away without running DeadMachineInstructionElim again?
addPass(&SIFoldOperandsLegacyID);
if (EnableDPPCombine)
addPass(&GCNDPPCombineLegacyID);
addPass(&SILoadStoreOptimizerLegacyID);
if (isPassEnabled(EnableSDWAPeephole)) {
addPass(&SIPeepholeSDWALegacyID);
addPass(&EarlyMachineLICMID);
addPass(&MachineCSELegacyID);
addPass(&SIFoldOperandsLegacyID);
}
addPass(&DeadMachineInstructionElimID);
addPass(createSIShrinkInstructionsLegacyPass());
}
bool GCNPassConfig::addILPOpts() {
if (EnableEarlyIfConversion)
addPass(&EarlyIfConverterLegacyID);
TargetPassConfig::addILPOpts();
return false;
}
bool GCNPassConfig::addInstSelector() {
AMDGPUPassConfig::addInstSelector();
addPass(&SIFixSGPRCopiesLegacyID);
addPass(createSILowerI1CopiesLegacyPass());
return false;
}
bool GCNPassConfig::addIRTranslator() {
addPass(new IRTranslator(getOptLevel()));
return false;
}
void GCNPassConfig::addPreLegalizeMachineIR() {
bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
addPass(new Localizer());
}
bool GCNPassConfig::addLegalizeMachineIR() {
addPass(new Legalizer());
return false;
}
void GCNPassConfig::addPreRegBankSelect() {
bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
addPass(createAMDGPUGlobalISelDivergenceLoweringPass());
}
bool GCNPassConfig::addRegBankSelect() {
if (NewRegBankSelect) {
addPass(createAMDGPURegBankSelectPass());
addPass(createAMDGPURegBankLegalizePass());
} else {
addPass(new RegBankSelect());
}
return false;
}
void GCNPassConfig::addPreGlobalInstructionSelect() {
bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
addPass(createAMDGPURegBankCombiner(IsOptNone));
}
bool GCNPassConfig::addGlobalInstructionSelect() {
addPass(new InstructionSelect(getOptLevel()));
return false;
}
void GCNPassConfig::addFastRegAlloc() {
// FIXME: We have to disable the verifier here because of PHIElimination +
// TwoAddressInstructions disabling it.
// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
TargetPassConfig::addFastRegAlloc();
}
void GCNPassConfig::addOptimizedRegAlloc() {
if (EnableDCEInRA)
insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID);
// FIXME: When an instruction has a killed operand and the instruction is
// inside a bundle, it seems that only the BUNDLE instruction appears as the
// kill of the register in LiveVariables. This triggers a verifier failure,
// so we should fix it and enable the verifier.
if (OptVGPRLiveRange)
insertPass(&LiveVariablesID, &SIOptimizeVGPRLiveRangeLegacyID);
// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of
// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowLegacyID);
if (EnableRewritePartialRegUses)
insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);
if (isPassEnabled(EnablePreRAOptimizations))
insertPass(&MachineSchedulerID, &GCNPreRAOptimizationsID);
// Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
// instructions that cause scheduling barriers.
insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
if (OptExecMaskPreRA)
insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
// This is not an essential optimization and it has a noticeable impact on
// compilation time, so we only enable it from O2.
if (TM->getOptLevel() > CodeGenOptLevel::Less)
insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
TargetPassConfig::addOptimizedRegAlloc();
}
bool GCNPassConfig::addPreRewrite() {
if (EnableRegReassign)
addPass(&GCNNSAReassignID);
return true;
}
FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
// Initialize the global default.
llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,
initializeDefaultSGPRRegisterAllocatorOnce);
RegisterRegAlloc::FunctionPassCtor Ctor = SGPRRegisterRegAlloc::getDefault();
if (Ctor != useDefaultRegisterAllocator)
return Ctor();
if (Optimized)
return createGreedyRegisterAllocator(onlyAllocateSGPRs);
return createFastRegisterAllocator(onlyAllocateSGPRs, false);
}
FunctionPass *GCNPassConfig::createVGPRAllocPass(bool Optimized) {
// Initialize the global default.
llvm::call_once(InitializeDefaultVGPRRegisterAllocatorFlag,
initializeDefaultVGPRRegisterAllocatorOnce);
RegisterRegAlloc::FunctionPassCtor Ctor = VGPRRegisterRegAlloc::getDefault();
if (Ctor != useDefaultRegisterAllocator)
return Ctor();
if (Optimized)
return createGreedyVGPRRegisterAllocator();
return createFastVGPRRegisterAllocator();
}
FunctionPass *GCNPassConfig::createWWMRegAllocPass(bool Optimized) {
// Initialize the global default.
llvm::call_once(InitializeDefaultWWMRegisterAllocatorFlag,
initializeDefaultWWMRegisterAllocatorOnce);
RegisterRegAlloc::FunctionPassCtor Ctor = WWMRegisterRegAlloc::getDefault();
if (Ctor != useDefaultRegisterAllocator)
return Ctor();
if (Optimized)
return createGreedyWWMRegisterAllocator();
return createFastWWMRegisterAllocator();
}
FunctionPass *GCNPassConfig::createRegAllocPass(bool Optimized) {
llvm_unreachable("should not be used");
}
static const char RegAllocOptNotSupportedMessage[] =
"-regalloc not supported with amdgcn. Use -sgpr-regalloc, -wwm-regalloc, "
"and -vgpr-regalloc";
bool GCNPassConfig::addRegAssignAndRewriteFast() {
if (!usingDefaultRegAlloc())
report_fatal_error(RegAllocOptNotSupportedMessage);
addPass(&GCNPreRALongBranchRegID);
addPass(createSGPRAllocPass(false));
// Equivalent of PEI for SGPRs.
addPass(&SILowerSGPRSpillsLegacyID);
// To allocate WWM registers used in whole quad mode operations (for shaders).
addPass(&SIPreAllocateWWMRegsLegacyID);
// For allocating other WWM register operands.
addPass(createWWMRegAllocPass(false));
addPass(&SILowerWWMCopiesLegacyID);
addPass(&AMDGPUReserveWWMRegsLegacyID);
// For allocating per-thread VGPRs.
addPass(createVGPRAllocPass(false));
return true;
}
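// The optimized path below runs the same SGPR -> WWM -> VGPR allocation
// sequence, but with the greedy allocators, an explicit VirtRegRewriter after
// each LiveIntervals-based run, and StackSlotColoring to optimize the SGPR
// spill stack indices.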
bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
if (!usingDefaultRegAlloc())
report_fatal_error(RegAllocOptNotSupportedMessage);
addPass(&GCNPreRALongBranchRegID);
addPass(createSGPRAllocPass(true));
// Commit allocated register changes. This is mostly necessary because too
// many things rely on the use lists of the physical registers, such as the
// verifier. This is only necessary with allocators which use LiveIntervals,
// since FastRegAlloc does the replacements itself.
addPass(createVirtRegRewriter(false));
  // At this point SGPR allocation is done, so it is worthwhile to run stack
  // slot coloring to try to optimize the SGPR spill stack indices before
  // attempting the custom SGPR spill lowering.
addPass(&StackSlotColoringID);
// Equivalent of PEI for SGPRs.
addPass(&SILowerSGPRSpillsLegacyID);
  // Allocate WWM registers used in whole quad mode operations (for shaders).
addPass(&SIPreAllocateWWMRegsLegacyID);
// For allocating other whole wave mode registers.
addPass(createWWMRegAllocPass(true));
addPass(&SILowerWWMCopiesLegacyID);
addPass(createVirtRegRewriter(false));
addPass(&AMDGPUReserveWWMRegsLegacyID);
// For allocating per-thread VGPRs.
addPass(createVGPRAllocPass(true));
addPreRewrite();
addPass(&VirtRegRewriterID);
addPass(&AMDGPUMarkLastScratchLoadID);
return true;
}
void GCNPassConfig::addPostRegAlloc() {
addPass(&SIFixVGPRCopiesID);
if (getOptLevel() > CodeGenOptLevel::None)
addPass(&SIOptimizeExecMaskingLegacyID);
TargetPassConfig::addPostRegAlloc();
}
void GCNPassConfig::addPreSched2() {
if (TM->getOptLevel() > CodeGenOptLevel::None)
addPass(createSIShrinkInstructionsLegacyPass());
addPass(&SIPostRABundlerLegacyID);
}
void GCNPassConfig::addPreEmitPass() {
if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
addPass(&GCNCreateVOPDID);
addPass(createSIMemoryLegalizerPass());
addPass(createSIInsertWaitcntsPass());
addPass(createSIModeRegisterPass());
if (getOptLevel() > CodeGenOptLevel::None)
addPass(&SIInsertHardClausesID);
addPass(&SILateBranchLoweringPassID);
if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
addPass(createAMDGPUSetWavePriorityPass());
if (getOptLevel() > CodeGenOptLevel::None)
addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-RA scheduler does not
  // guarantee that it can handle all hazards correctly. This is because, if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom-up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
//
// Here we add a stand-alone hazard recognizer pass which can handle all
// cases.
addPass(&PostRAHazardRecognizerID);
addPass(&AMDGPUWaitSGPRHazardsLegacyID);
if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
addPass(&AMDGPUInsertDelayAluID);
addPass(&BranchRelaxationPassID);
}
void GCNPassConfig::addPostBBSections() {
// We run this later to avoid passes like livedebugvalues and BBSections
// having to deal with the apparent multi-entry functions we may generate.
addPass(createAMDGPUPreloadKernArgPrologLegacyPass());
}
TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
return new GCNPassConfig(*this, PM);
}
void GCNTargetMachine::registerMachineRegisterInfoCallback(
MachineFunction &MF) const {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
MF.getRegInfo().addDelegate(MFI);
}
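// SIMachineFunctionInfo is installed as a MachineRegisterInfo delegate above
// so it can be notified when virtual registers are created or cloned and keep
// its per-register bookkeeping in sync.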
MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
BumpPtrAllocator &Allocator, const Function &F,
const TargetSubtargetInfo *STI) const {
return SIMachineFunctionInfo::create<SIMachineFunctionInfo>(
Allocator, F, static_cast<const GCNSubtarget *>(STI));
}
yaml::MachineFunctionInfo *GCNTargetMachine::createDefaultFuncInfoYAML() const {
return new yaml::SIMachineFunctionInfo();
}
yaml::MachineFunctionInfo *
GCNTargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
return new yaml::SIMachineFunctionInfo(
*MFI, *MF.getSubtarget<GCNSubtarget>().getRegisterInfo(), MF);
}
bool GCNTargetMachine::parseMachineFunctionInfo(
const yaml::MachineFunctionInfo &MFI_, PerFunctionMIParsingState &PFS,
SMDiagnostic &Error, SMRange &SourceRange) const {
const yaml::SIMachineFunctionInfo &YamlMFI =
static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
MachineFunction &MF = PFS.MF;
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
return true;
if (MFI->Occupancy == 0) {
// Fixup the subtarget dependent default value.
MFI->Occupancy = ST.getOccupancyWithWorkGroupSizes(MF).second;
}
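  // Helper to parse a named register reference; on failure it points the
  // diagnostic at the YAML field and leaves RegVal unmodified.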
auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
Register TempReg;
if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
SourceRange = RegName.SourceRange;
return true;
}
RegVal = TempReg;
return false;
};
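  // An empty string means the field was absent from the YAML, so only attempt
  // to parse nonempty values.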
auto parseOptionalRegister = [&](const yaml::StringValue &RegName,
Register &RegVal) {
return !RegName.Value.empty() && parseRegister(RegName, RegVal);
};
if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
return true;
if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
return true;
if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
MFI->LongBranchReservedReg))
return true;
auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
    // Create a diagnostic for the register string literal.
const MemoryBuffer &Buffer =
*PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
RegName.Value.size(), SourceMgr::DK_Error,
"incorrect register class for field", RegName.Value,
{}, {});
SourceRange = RegName.SourceRange;
return true;
};
if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
return true;
if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
!AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
}
if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
!AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
}
if (MFI->StackPtrOffsetReg != AMDGPU::SP_REG &&
!AMDGPU::SGPR_32RegClass.contains(MFI->StackPtrOffsetReg)) {
return diagnoseRegisterClass(YamlMFI.StackPtrOffsetReg);
}
for (const auto &YamlReg : YamlMFI.WWMReservedRegs) {
Register ParsedReg;
if (parseRegister(YamlReg, ParsedReg))
return true;
MFI->reserveWWMRegister(ParsedReg);
}
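  // Re-apply the per-virtual-register flags recorded in the MIR, for both
  // named and numbered virtual registers.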
for (const auto &[_, Info] : PFS.VRegInfosNamed) {
MFI->setFlag(Info->VReg, Info->Flags);
}
for (const auto &[_, Info] : PFS.VRegInfos) {
MFI->setFlag(Info->VReg, Info->Flags);
}
for (const auto &YamlRegStr : YamlMFI.SpillPhysVGPRS) {
Register ParsedReg;
if (parseRegister(YamlRegStr, ParsedReg))
return true;
MFI->SpillPhysVGPRs.push_back(ParsedReg);
}
auto parseAndCheckArgument = [&](const std::optional<yaml::SIArgument> &A,
const TargetRegisterClass &RC,
ArgDescriptor &Arg, unsigned UserSGPRs,
unsigned SystemSGPRs) {
// Skip parsing if it's not present.
if (!A)
return false;
if (A->IsRegister) {
Register Reg;
if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
SourceRange = A->RegisterName.SourceRange;
return true;
}
if (!RC.contains(Reg))
return diagnoseRegisterClass(A->RegisterName);
Arg = ArgDescriptor::createRegister(Reg);
} else
Arg = ArgDescriptor::createStack(A->StackOffset);
// Check and apply the optional mask.
if (A->Mask)
Arg = ArgDescriptor::createArg(Arg, *A->Mask);
MFI->NumUserSGPRs += UserSGPRs;
MFI->NumSystemSGPRs += SystemSGPRs;
return false;
};
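  // Parse each optional ABI argument in turn; the two trailing integer
  // arguments state how many user and system SGPRs the argument consumes when
  // it is present.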
if (YamlMFI.ArgInfo &&
(parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
AMDGPU::SGPR_128RegClass,
MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
2, 0) ||
parseAndCheckArgument(YamlMFI.ArgInfo->QueuePtr, AMDGPU::SReg_64RegClass,
MFI->ArgInfo.QueuePtr, 2, 0) ||
parseAndCheckArgument(YamlMFI.ArgInfo->KernargSegmentPtr,
AMDGPU::SReg_64RegClass,
MFI->ArgInfo.KernargSegmentPtr, 2, 0) ||
parseAndCheckArgument(YamlMFI.ArgInfo->DispatchID,
AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchID,
2, 0) ||
parseAndCheckArgument(YamlMFI.ArgInfo->FlatScratchInit,
AMDGPU::SReg_64RegClass,
MFI->ArgInfo.FlatScratchInit, 2, 0) ||
parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
AMDGPU::SGPR_32RegClass,
MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
AMDGPU::SGPR_32RegClass,
MFI->ArgInfo.LDSKernelId, 0, 1) ||
parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
0, 1) ||
parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDY,
AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDY,
0, 1) ||
parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDZ,
AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDZ,
0, 1) ||
parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupInfo,
AMDGPU::SGPR_32RegClass,
MFI->ArgInfo.WorkGroupInfo, 0, 1) ||
parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentWaveByteOffset,
AMDGPU::SGPR_32RegClass,
MFI->ArgInfo.PrivateSegmentWaveByteOffset, 0, 1) ||
parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitArgPtr,
AMDGPU::SReg_64RegClass,
MFI->ArgInfo.ImplicitArgPtr, 0, 0) ||
parseAndCheckArgument(YamlMFI.ArgInfo->ImplicitBufferPtr,
AMDGPU::SReg_64RegClass,
MFI->ArgInfo.ImplicitBufferPtr, 2, 0) ||
parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX,
AMDGPU::VGPR_32RegClass,
MFI->ArgInfo.WorkItemIDX, 0, 0) ||
parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY,
AMDGPU::VGPR_32RegClass,
MFI->ArgInfo.WorkItemIDY, 0, 0) ||
parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ,
AMDGPU::VGPR_32RegClass,
MFI->ArgInfo.WorkItemIDZ, 0, 0)))
return true;
if (ST.hasIEEEMode())
MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
if (ST.hasDX10ClampMode())
MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
// FIXME: Move proper support for denormal-fp-math into base MachineFunction
MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
? DenormalMode::IEEE
: DenormalMode::PreserveSign;
MFI->Mode.FP32Denormals.Output = YamlMFI.Mode.FP32OutputDenormals
? DenormalMode::IEEE
: DenormalMode::PreserveSign;
MFI->Mode.FP64FP16Denormals.Input = YamlMFI.Mode.FP64FP16InputDenormals
? DenormalMode::IEEE
: DenormalMode::PreserveSign;
MFI->Mode.FP64FP16Denormals.Output = YamlMFI.Mode.FP64FP16OutputDenormals
? DenormalMode::IEEE
: DenormalMode::PreserveSign;
if (YamlMFI.HasInitWholeWave)
MFI->setInitWholeWave();
return false;
}
//===----------------------------------------------------------------------===//
// AMDGPU CodeGen Pass Builder interface.
//===----------------------------------------------------------------------===//
AMDGPUCodeGenPassBuilder::AMDGPUCodeGenPassBuilder(
GCNTargetMachine &TM, const CGPassBuilderOption &Opts,
PassInstrumentationCallbacks *PIC)
: CodeGenPassBuilder(TM, Opts, PIC) {
Opt.MISchedPostRA = true;
Opt.RequiresCodeGenSCCOrder = true;
  // Exceptions, StackMaps, and garbage collection are not supported, so these
  // passes will never do anything.
disablePass<StackMapLivenessPass, FuncletLayoutPass,
ShadowStackGCLoweringPass>();
}
void AMDGPUCodeGenPassBuilder::addIRPasses(AddIRPass &addPass) const {
if (RemoveIncompatibleFunctions && TM.getTargetTriple().isAMDGCN())
addPass(AMDGPURemoveIncompatibleFunctionsPass(TM));
addPass(AMDGPUPrintfRuntimeBindingPass());
if (LowerCtorDtor)
addPass(AMDGPUCtorDtorLoweringPass());
if (isPassEnabled(EnableImageIntrinsicOptimizer))
addPass(AMDGPUImageIntrinsicOptimizerPass(TM));
// This can be disabled by passing ::Disable here or on the command line
// with --expand-variadics-override=disable.
addPass(ExpandVariadicsPass(ExpandVariadicsMode::Lowering));
addPass(AMDGPUAlwaysInlinePass());
addPass(AlwaysInlinerPass());
addPass(AMDGPUExportKernelRuntimeHandlesPass());
if (EnableSwLowerLDS)
addPass(AMDGPUSwLowerLDSPass(TM));
  // Runs before PromoteAlloca so the latter can account for function uses.
if (EnableLowerModuleLDS)
addPass(AMDGPULowerModuleLDSPass(TM));
if (TM.getOptLevel() > CodeGenOptLevel::None)
addPass(InferAddressSpacesPass());
  // Run the atomic optimizer before AtomicExpand.
if (TM.getOptLevel() >= CodeGenOptLevel::Less &&
(AMDGPUAtomicOptimizerStrategy != ScanOptions::None))
addPass(AMDGPUAtomicOptimizerPass(TM, AMDGPUAtomicOptimizerStrategy));
addPass(AtomicExpandPass(&TM));
if (TM.getOptLevel() > CodeGenOptLevel::None) {
addPass(AMDGPUPromoteAllocaPass(TM));
if (isPassEnabled(EnableScalarIRPasses))
addStraightLineScalarOptimizationPasses(addPass);
// TODO: Handle EnableAMDGPUAliasAnalysis
// TODO: May want to move later or split into an early and late one.
addPass(AMDGPUCodeGenPreparePass(TM));
// TODO: LICM
}
Base::addIRPasses(addPass);
// EarlyCSE is not always strong enough to clean up what LSR produces. For
// example, GVN can combine
//
// %0 = add %a, %b
// %1 = add %b, %a
//
// and
//
// %0 = shl nsw %a, 2
// %1 = shl %a, 2
//
// but EarlyCSE can do neither of them.
if (isPassEnabled(EnableScalarIRPasses))
addEarlyCSEOrGVNPass(addPass);
}
void AMDGPUCodeGenPassBuilder::addCodeGenPrepare(AddIRPass &addPass) const {
// AMDGPUAnnotateKernelFeaturesPass is missing here, but it will hopefully be
// deleted soon.
if (TM.getOptLevel() > CodeGenOptLevel::None)
addPass(AMDGPUPreloadKernelArgumentsPass(TM));
if (EnableLowerKernelArguments)
addPass(AMDGPULowerKernelArgumentsPass(TM));
// This lowering has been placed after codegenprepare to take advantage of
// address mode matching (which is why it isn't put with the LDS lowerings).
// It could be placed anywhere before uniformity annotations (an analysis
// that it changes by splitting up fat pointers into their components)
// but has been put before switch lowering and CFG flattening so that those
// passes can run on the more optimized control flow this pass creates in
// many cases.
//
  // FIXME: This should ideally be put after the LoadStoreVectorizer.
  // However, due to some annoying facts about ResourceUsageAnalysis
  // (especially as exercised in the resource-usage-dead-function test),
  // we need all the function passes from codegenprepare all the way through
  // said resource usage analysis to run on the call graph produced
  // before codegenprepare runs (because codegenprepare will knock some
  // nodes out of the graph, which leads to function-level passes not
  // being run on them, which causes crashes in the resource usage analysis).
addPass(AMDGPULowerBufferFatPointersPass(TM));
Base::addCodeGenPrepare(addPass);
if (isPassEnabled(EnableLoadStoreVectorizer))
addPass(LoadStoreVectorizerPass());
  // The LowerSwitch pass may introduce unreachable blocks that can cause
  // unexpected behavior for subsequent passes. Placing it here means those
  // blocks will be cleaned up by the UnreachableBlockElim pass inserted next
  // in the pass flow.
addPass(LowerSwitchPass());
}
void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const {
if (TM.getOptLevel() > CodeGenOptLevel::None) {
addPass(FlattenCFGPass());
addPass(SinkingPass());
addPass(AMDGPULateCodeGenPreparePass(TM));
}
// Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
// regions formed by them.
addPass(AMDGPUUnifyDivergentExitNodesPass());
addPass(FixIrreduciblePass());
addPass(UnifyLoopExitsPass());
addPass(StructurizeCFGPass(/*SkipUniformRegions=*/false));
addPass(AMDGPUAnnotateUniformValuesPass());
addPass(SIAnnotateControlFlowPass(TM));
// TODO: Move this right after structurizeCFG to avoid extra divergence
// analysis. This depends on stopping SIAnnotateControlFlow from making
// control flow modifications.
addPass(AMDGPURewriteUndefForPHIPass());
if (!getCGPassBuilderOption().EnableGlobalISelOption ||
!isGlobalISelAbortEnabled() || !NewRegBankSelect)
addPass(LCSSAPass());
if (TM.getOptLevel() > CodeGenOptLevel::Less)
addPass(AMDGPUPerfHintAnalysisPass(TM));
// FIXME: Why isn't this queried as required from AMDGPUISelDAGToDAG, and why
// isn't this in addInstSelector?
addPass(RequireAnalysisPass<UniformityInfoAnalysis, Function>());
}
void AMDGPUCodeGenPassBuilder::addILPOpts(AddMachinePass &addPass) const {
if (EnableEarlyIfConversion)
addPass(EarlyIfConverterPass());
Base::addILPOpts(addPass);
}
void AMDGPUCodeGenPassBuilder::addAsmPrinter(AddMachinePass &addPass,
CreateMCStreamer) const {
// TODO: Add AsmPrinter.
}
Error AMDGPUCodeGenPassBuilder::addInstSelector(AddMachinePass &addPass) const {
addPass(AMDGPUISelDAGToDAGPass(TM));
addPass(SIFixSGPRCopiesPass());
addPass(SILowerI1CopiesPass());
return Error::success();
}
void AMDGPUCodeGenPassBuilder::addPreRewrite(AddMachinePass &addPass) const {
if (EnableRegReassign) {
addPass(GCNNSAReassignPass());
}
}
void AMDGPUCodeGenPassBuilder::addMachineSSAOptimization(
AddMachinePass &addPass) const {
Base::addMachineSSAOptimization(addPass);
addPass(SIFoldOperandsPass());
if (EnableDPPCombine) {
addPass(GCNDPPCombinePass());
}
addPass(SILoadStoreOptimizerPass());
if (isPassEnabled(EnableSDWAPeephole)) {
addPass(SIPeepholeSDWAPass());
addPass(EarlyMachineLICMPass());
addPass(MachineCSEPass());
addPass(SIFoldOperandsPass());
}
addPass(DeadMachineInstructionElimPass());
addPass(SIShrinkInstructionsPass());
}
Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized(
AddMachinePass &addPass) const {
// TODO: Check --regalloc-npm option
addPass(GCNPreRALongBranchRegPass());
addPass(RAGreedyPass({onlyAllocateSGPRs, "sgpr"}));
// Commit allocated register changes. This is mostly necessary because too
// many things rely on the use lists of the physical registers, such as the
// verifier. This is only necessary with allocators which use LiveIntervals,
// since FastRegAlloc does the replacements itself.
addPass(VirtRegRewriterPass(false));
  // At this point SGPR allocation is done, so it is worthwhile to run stack
  // slot coloring to try to optimize the SGPR spill stack indices before
  // attempting the custom SGPR spill lowering.
addPass(StackSlotColoringPass());
// Equivalent of PEI for SGPRs.
addPass(SILowerSGPRSpillsPass());
  // Allocate WWM registers used in whole quad mode operations (for shaders).
addPass(SIPreAllocateWWMRegsPass());
// For allocating other wwm register operands.
// addRegAlloc<RAGreedyPass>(addPass, RegAllocPhase::WWM);
addPass(RAGreedyPass({onlyAllocateWWMRegs, "wwm"}));
addPass(SILowerWWMCopiesPass());
addPass(VirtRegRewriterPass(false));
addPass(AMDGPUReserveWWMRegsPass());
// For allocating per-thread VGPRs.
// addRegAlloc<RAGreedyPass>(addPass, RegAllocPhase::VGPR);
addPass(RAGreedyPass({onlyAllocateVGPRs, "vgpr"}));
addPreRewrite(addPass);
addPass(VirtRegRewriterPass(true));
// TODO: addPass(AMDGPUMarkLastScratchLoadPass());
return Error::success();
}
void AMDGPUCodeGenPassBuilder::addPostRegAlloc(AddMachinePass &addPass) const {
addPass(SIFixVGPRCopiesPass());
if (TM.getOptLevel() > CodeGenOptLevel::None)
addPass(SIOptimizeExecMaskingPass());
Base::addPostRegAlloc(addPass);
}
void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const {
if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) {
addPass(GCNCreateVOPDPass());
}
addPass(SIMemoryLegalizerPass());
addPass(SIInsertWaitcntsPass());
// TODO: addPass(SIModeRegisterPass());
if (TM.getOptLevel() > CodeGenOptLevel::None) {
// TODO: addPass(SIInsertHardClausesPass());
}
addPass(SILateBranchLoweringPass());
if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
addPass(AMDGPUSetWavePriorityPass());
if (TM.getOptLevel() > CodeGenOptLevel::None)
addPass(SIPreEmitPeepholePass());
  // The hazard recognizer that runs as part of the post-RA scheduler does not
  // guarantee that it can handle all hazards correctly. This is because, if
  // there are multiple scheduling regions in a basic block, the regions are
  // scheduled bottom-up, so when we begin to schedule a region we don't know
  // what instructions were emitted directly before it.
//
// Here we add a stand-alone hazard recognizer pass which can handle all
// cases.
addPass(PostRAHazardRecognizerPass());
addPass(AMDGPUWaitSGPRHazardsPass());
if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less)) {
addPass(AMDGPUInsertDelayAluPass());
}
addPass(BranchRelaxationPass());
}
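// An explicit command-line occurrence of Opt always wins. Otherwise the
// option's default is honored only at or above Level, e.g. (assuming the
// option defaults to true):
//   isPassEnabled(EnableVOPD, CodeGenOptLevel::Less) // true at -O1 and above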
bool AMDGPUCodeGenPassBuilder::isPassEnabled(const cl::opt<bool> &Opt,
CodeGenOptLevel Level) const {
if (Opt.getNumOccurrences())
return Opt;
if (TM.getOptLevel() < Level)
return false;
return Opt;
}
void AMDGPUCodeGenPassBuilder::addEarlyCSEOrGVNPass(AddIRPass &addPass) const {
if (TM.getOptLevel() == CodeGenOptLevel::Aggressive)
addPass(GVNPass());
else
addPass(EarlyCSEPass());
}
void AMDGPUCodeGenPassBuilder::addStraightLineScalarOptimizationPasses(
AddIRPass &addPass) const {
if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
addPass(LoopDataPrefetchPass());
addPass(SeparateConstOffsetFromGEPPass());
// ReassociateGEPs exposes more opportunities for SLSR. See
// the example in reassociate-geps-and-slsr.ll.
addPass(StraightLineStrengthReducePass());
  // SeparateConstOffsetFromGEP and SLSR create common expressions that GVN or
  // EarlyCSE can reuse.
addEarlyCSEOrGVNPass(addPass);
// Run NaryReassociate after EarlyCSE/GVN to be more effective.
addPass(NaryReassociatePass());
// NaryReassociate on GEPs creates redundant common expressions, so run
// EarlyCSE after it.
addPass(EarlyCSEPass());
}