[AMDGPU] Add MaxMemoryClauseSchedStrategy (#114957)
Also expose an option to choose a custom scheduler strategy: amdgpu-sched-strategy={max-ilp|max-memory-clause}. This can be set through either a function attribute or a command-line option. The major behaviors of the max-memory-clause scheduling strategy include: 1. Try to cluster memory instructions more aggressively. 2. Try to schedule long-latency loads earlier than short-latency instructions. I tested locally against about 470 real shaders and got the following perf changes (only counting perf changes over +/-10%): about 15 shaders improved by 10%~40%; only 3 shaders dropped by ~10%. (This was tested together with another change which increases the maximum clustered dwords from 8 to 32.) I will make another change to make that threshold configurable.
This commit is contained in:
parent
a4506bb340
commit
b33c807b39
@ -428,10 +428,10 @@ static cl::opt<bool>
|
||||
cl::desc("Enable loop data prefetch on AMDGPU"),
|
||||
cl::Hidden, cl::init(false));
|
||||
|
||||
// Hidden boolean command-line flag, off by default, that requests the
// max-ILP scheduling strategy.
static cl::opt<bool>
    EnableMaxIlpSchedStrategy("amdgpu-enable-max-ilp-scheduling-strategy",
                              cl::desc("Enable scheduling strategy to maximize "
                                       "ILP for a single wave."),
                              cl::Hidden, cl::init(false));
|
||||
// Hidden string command-line option naming a custom AMDGPU scheduling
// strategy. The default is the empty string.
static cl::opt<std::string> AMDGPUSchedStrategy(
    "amdgpu-sched-strategy",
    cl::desc("Select custom AMDGPU scheduling strategy."), cl::Hidden,
    cl::init(""));
|
||||
|
||||
static cl::opt<bool> EnableRewritePartialRegUses(
|
||||
"amdgpu-enable-rewrite-partial-reg-uses",
|
||||
@ -567,6 +567,18 @@ createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
|
||||
return DAG;
|
||||
}
|
||||
|
||||
/// Build a GCNScheduleDAGMILive driven by GCNMaxMemoryClauseSchedStrategy.
///
/// The DAG always receives the load-clustering and export-clustering
/// mutations. The store-clustering mutation is added between them only when
/// the subtarget reports shouldClusterStores().
///
/// \param C the machine scheduling context handed to both the DAG and the
///          strategy.
/// \return the newly allocated scheduler DAG.
static ScheduleDAGInstrs *
createGCNMaxMemoryClauseMachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive(
      C, std::make_unique<GCNMaxMemoryClauseSchedStrategy>(C));

  // Loads are always clustered.
  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));

  // Stores are clustered only when the subtarget asks for it.
  const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
  if (ST.shouldClusterStores())
    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));

  DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
  return DAG;
}
|
||||
|
||||
static ScheduleDAGInstrs *
|
||||
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
|
||||
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
|
||||
@ -607,6 +619,10 @@ static MachineSchedRegistry
|
||||
GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
|
||||
createGCNMaxILPMachineScheduler);
|
||||
|
||||
// Register the max-memory-clause scheduler factory under the name
// "gcn-max-memory-clause".
static MachineSchedRegistry
    GCNMaxMemoryClauseSchedRegistry("gcn-max-memory-clause",
                                    "Run GCN scheduler to maximize memory clause",
                                    createGCNMaxMemoryClauseMachineScheduler);
|
||||
|
||||
static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
|
||||
"gcn-iterative-max-occupancy-experimental",
|
||||
"Run GCN scheduler to maximize occupancy (experimental)",
|
||||
@ -1294,9 +1310,18 @@ ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
|
||||
if (ST.enableSIScheduler())
|
||||
return createSIMachineScheduler(C);
|
||||
|
||||
if (EnableMaxIlpSchedStrategy)
|
||||
Attribute SchedStrategyAttr =
|
||||
C->MF->getFunction().getFnAttribute("amdgpu-sched-strategy");
|
||||
StringRef SchedStrategy = SchedStrategyAttr.isValid()
|
||||
? SchedStrategyAttr.getValueAsString()
|
||||
: AMDGPUSchedStrategy;
|
||||
|
||||
if (SchedStrategy == "max-ilp")
|
||||
return createGCNMaxILPMachineScheduler(C);
|
||||
|
||||
if (SchedStrategy == "max-memory-clause")
|
||||
return createGCNMaxMemoryClauseMachineScheduler(C);
|
||||
|
||||
return createGCNMaxOccupancyMachineScheduler(C);
|
||||
}
|
||||
|
||||
|
@ -615,6 +615,138 @@ bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Construct the strategy on top of GCNSchedStrategy and append
/// MemoryClauseInitialSchedule to the list of scheduling stages.
///
/// \param C the machine scheduling context forwarded to the base class.
GCNMaxMemoryClauseSchedStrategy::GCNMaxMemoryClauseSchedStrategy(
    const MachineSchedContext *C)
    : GCNSchedStrategy(C) {
  const GCNSchedStageID InitialStage =
      GCNSchedStageID::MemoryClauseInitialSchedule;
  SchedStages.push_back(InitialStage);
}
|
||||
|
||||
/// GCNMaxMemoryClauseSchedStrategy tries best to clause memory instructions as
|
||||
/// much as possible. This is achieved by:
|
||||
// 1. Prioritize clustered operations before stall latency heuristic.
|
||||
// 2. Prioritize long-latency-load before stall latency heuristic.
|
||||
///
|
||||
/// \param Cand provides the policy and current best candidate.
|
||||
/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
|
||||
/// \param Zone describes the scheduled zone that we are extending, or nullptr
|
||||
/// if Cand is from a different zone than TryCand.
|
||||
/// \return \c true if TryCand is better than Cand (Reason is NOT NoCand)
|
||||
bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand,
|
||||
SchedCandidate &TryCand,
|
||||
SchedBoundary *Zone) const {
|
||||
// Initialize the candidate if needed.
|
||||
if (!Cand.isValid()) {
|
||||
TryCand.Reason = NodeOrder;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Bias PhysReg Defs and copies to their uses and defined respectively.
|
||||
if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
|
||||
biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
|
||||
return TryCand.Reason != NoCand;
|
||||
|
||||
if (DAG->isTrackingPressure()) {
|
||||
// Avoid exceeding the target's limit.
|
||||
if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
|
||||
RegExcess, TRI, DAG->MF))
|
||||
return TryCand.Reason != NoCand;
|
||||
|
||||
// Avoid increasing the max critical pressure in the scheduled region.
|
||||
if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
|
||||
TryCand, Cand, RegCritical, TRI, DAG->MF))
|
||||
return TryCand.Reason != NoCand;
|
||||
}
|
||||
|
||||
// MaxMemoryClause-specific: We prioritize clustered instructions as we would
|
||||
// get more benefit from clausing these memory instructions.
|
||||
const SUnit *CandNextClusterSU =
|
||||
Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
|
||||
const SUnit *TryCandNextClusterSU =
|
||||
TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
|
||||
if (tryGreater(TryCand.SU == TryCandNextClusterSU,
|
||||
Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
|
||||
return TryCand.Reason != NoCand;
|
||||
|
||||
// We only compare a subset of features when comparing nodes between
|
||||
// Top and Bottom boundary. Some properties are simply incomparable, in many
|
||||
// other instances we should only override the other boundary if something
|
||||
// is a clear good pick on one boundary. Skip heuristics that are more
|
||||
// "tie-breaking" in nature.
|
||||
bool SameBoundary = Zone != nullptr;
|
||||
if (SameBoundary) {
|
||||
// For loops that are acyclic path limited, aggressively schedule for
|
||||
// latency. Within an single cycle, whenever CurrMOps > 0, allow normal
|
||||
// heuristics to take precedence.
|
||||
if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
|
||||
tryLatency(TryCand, Cand, *Zone))
|
||||
return TryCand.Reason != NoCand;
|
||||
|
||||
// MaxMemoryClause-specific: Prioritize long latency memory load
|
||||
// instructions in top-bottom order to hide more latency. The mayLoad check
|
||||
// is used to exclude store-like instructions, which we do not want to
|
||||
// scheduler them too early.
|
||||
bool TryMayLoad =
|
||||
TryCand.SU->isInstr() && TryCand.SU->getInstr()->mayLoad();
|
||||
bool CandMayLoad = Cand.SU->isInstr() && Cand.SU->getInstr()->mayLoad();
|
||||
|
||||
if (TryMayLoad || CandMayLoad) {
|
||||
bool TryLongLatency =
|
||||
TryCand.SU->Latency > 10 * Cand.SU->Latency && TryMayLoad;
|
||||
bool CandLongLatency =
|
||||
10 * TryCand.SU->Latency < Cand.SU->Latency && CandMayLoad;
|
||||
|
||||
if (tryGreater(Zone->isTop() ? TryLongLatency : CandLongLatency,
|
||||
Zone->isTop() ? CandLongLatency : TryLongLatency, TryCand,
|
||||
Cand, Stall))
|
||||
return TryCand.Reason != NoCand;
|
||||
}
|
||||
// Prioritize instructions that read unbuffered resources by stall cycles.
|
||||
if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
|
||||
Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
|
||||
return TryCand.Reason != NoCand;
|
||||
}
|
||||
|
||||
if (SameBoundary) {
|
||||
// Weak edges are for clustering and other constraints.
|
||||
if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
|
||||
getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
|
||||
return TryCand.Reason != NoCand;
|
||||
}
|
||||
|
||||
// Avoid increasing the max pressure of the entire region.
|
||||
if (DAG->isTrackingPressure() &&
|
||||
tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
|
||||
Cand, RegMax, TRI, DAG->MF))
|
||||
return TryCand.Reason != NoCand;
|
||||
|
||||
if (SameBoundary) {
|
||||
// Avoid critical resource consumption and balance the schedule.
|
||||
TryCand.initResourceDelta(DAG, SchedModel);
|
||||
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
|
||||
TryCand, Cand, ResourceReduce))
|
||||
return TryCand.Reason != NoCand;
|
||||
if (tryGreater(TryCand.ResDelta.DemandedResources,
|
||||
Cand.ResDelta.DemandedResources, TryCand, Cand,
|
||||
ResourceDemand))
|
||||
return TryCand.Reason != NoCand;
|
||||
|
||||
// Avoid serializing long latency dependence chains.
|
||||
// For acyclic path limited loops, latency was already checked above.
|
||||
if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
|
||||
!Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
|
||||
return TryCand.Reason != NoCand;
|
||||
|
||||
// Fall through to original instruction order.
|
||||
if (Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum)) {
|
||||
assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
|
||||
TryCand.Reason = NodeOrder;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
GCNScheduleDAGMILive::GCNScheduleDAGMILive(
|
||||
MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
|
||||
: ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
|
||||
@ -644,6 +776,9 @@ GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
|
||||
return std::make_unique<PreRARematStage>(SchedStageID, *this);
|
||||
case GCNSchedStageID::ILPInitialSchedule:
|
||||
return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
|
||||
case GCNSchedStageID::MemoryClauseInitialSchedule:
|
||||
return std::make_unique<MemoryClauseInitialScheduleStage>(SchedStageID,
|
||||
*this);
|
||||
}
|
||||
|
||||
llvm_unreachable("Unknown SchedStageID.");
|
||||
@ -869,6 +1004,9 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
|
||||
case GCNSchedStageID::ILPInitialSchedule:
|
||||
OS << "Max ILP Initial Schedule";
|
||||
break;
|
||||
case GCNSchedStageID::MemoryClauseInitialSchedule:
|
||||
OS << "Max memory clause Initial Schedule";
|
||||
break;
|
||||
}
|
||||
|
||||
return OS;
|
||||
@ -1088,7 +1226,8 @@ void GCNSchedStage::setupNewBlock() {
|
||||
// Get real RP for the region if it hasn't be calculated before. After the
|
||||
// initial schedule stage real RP will be collected after scheduling.
|
||||
if (StageID == GCNSchedStageID::OccInitialSchedule ||
|
||||
StageID == GCNSchedStageID::ILPInitialSchedule)
|
||||
StageID == GCNSchedStageID::ILPInitialSchedule ||
|
||||
StageID == GCNSchedStageID::MemoryClauseInitialSchedule)
|
||||
DAG.computeBlockPressure(RegionIdx, CurrentMBB);
|
||||
}
|
||||
|
||||
@ -1389,6 +1528,11 @@ bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool MemoryClauseInitialScheduleStage::shouldRevertScheduling(
|
||||
unsigned WavesAfter) {
|
||||
return mayCauseSpilling(WavesAfter);
|
||||
}
|
||||
|
||||
bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
|
||||
if (WavesAfter <= MFI.getMinWavesPerEU() && isRegionWithExcessRP() &&
|
||||
!PressureAfter.less(MF, PressureBefore)) {
|
||||
|
@ -29,7 +29,8 @@ enum class GCNSchedStageID : unsigned {
|
||||
UnclusteredHighRPReschedule = 1,
|
||||
ClusteredLowOccupancyReschedule = 2,
|
||||
PreRARematerialize = 3,
|
||||
ILPInitialSchedule = 4
|
||||
ILPInitialSchedule = 4,
|
||||
MemoryClauseInitialSchedule = 5
|
||||
};
|
||||
|
||||
#ifndef NDEBUG
|
||||
@ -149,6 +150,17 @@ public:
|
||||
GCNMaxILPSchedStrategy(const MachineSchedContext *C);
|
||||
};
|
||||
|
||||
/// The goal of this scheduling strategy is to maximize memory clause for a
|
||||
/// single wave.
|
||||
class GCNMaxMemoryClauseSchedStrategy final : public GCNSchedStrategy {
|
||||
protected:
|
||||
bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
|
||||
SchedBoundary *Zone) const override;
|
||||
|
||||
public:
|
||||
GCNMaxMemoryClauseSchedStrategy(const MachineSchedContext *C);
|
||||
};
|
||||
|
||||
class ScheduleMetrics {
|
||||
unsigned ScheduleLength;
|
||||
unsigned BubbleCycles;
|
||||
@ -463,6 +475,15 @@ public:
|
||||
: GCNSchedStage(StageID, DAG) {}
|
||||
};
|
||||
|
||||
class MemoryClauseInitialScheduleStage : public GCNSchedStage {
|
||||
public:
|
||||
bool shouldRevertScheduling(unsigned WavesAfter) override;
|
||||
|
||||
MemoryClauseInitialScheduleStage(GCNSchedStageID StageID,
|
||||
GCNScheduleDAGMILive &DAG)
|
||||
: GCNSchedStage(StageID, DAG) {}
|
||||
};
|
||||
|
||||
class GCNPostScheduleDAGMILive final : public ScheduleDAGMI {
|
||||
private:
|
||||
std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
|
||||
|
455
llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
Normal file
455
llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
Normal file
@ -0,0 +1,455 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
|
||||
|
||||
define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 inreg noundef %userdata6, i32 inreg noundef %userdata7, i32 inreg noundef %userdata8, i32 inreg noundef %PrimMask, <2 x float> noundef %PerspInterpSample, <2 x float> noundef %PerspInterpCenter, <2 x float> noundef %PerspInterpCentroid) #2 {
|
||||
; GFX11-LABEL: group_image_sample:
|
||||
; GFX11: ; %bb.0: ; %.entry
|
||||
; GFX11-NEXT: s_mov_b32 s24, exec_lo
|
||||
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
|
||||
; GFX11-NEXT: s_mov_b32 m0, s4
|
||||
; GFX11-NEXT: s_getpc_b64 s[4:5]
|
||||
; GFX11-NEXT: s_mov_b32 s0, s1
|
||||
; GFX11-NEXT: s_mov_b32 s6, s3
|
||||
; GFX11-NEXT: s_mov_b32 s1, s5
|
||||
; GFX11-NEXT: s_mov_b32 s3, s5
|
||||
; GFX11-NEXT: s_mov_b32 s7, s5
|
||||
; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x0
|
||||
; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x0
|
||||
; GFX11-NEXT: s_load_b256 s[0:7], s[6:7], 0x0
|
||||
; GFX11-NEXT: s_mov_b32 s16, exec_lo
|
||||
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
|
||||
; GFX11-NEXT: lds_param_load v2, attr0.y wait_vdst:15
|
||||
; GFX11-NEXT: lds_param_load v3, attr0.x wait_vdst:15
|
||||
; GFX11-NEXT: s_mov_b32 exec_lo, s16
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_clause 0x3
|
||||
; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0x10
|
||||
; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x20
|
||||
; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0x30
|
||||
; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0x40
|
||||
; GFX11-NEXT: v_interp_p10_f32 v4, v2, v0, v2 wait_exp:1
|
||||
; GFX11-NEXT: v_interp_p10_f32 v0, v3, v0, v3 wait_exp:0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
|
||||
; GFX11-NEXT: v_interp_p2_f32 v0, v3, v1, v0 wait_exp:7
|
||||
; GFX11-NEXT: v_interp_p2_f32 v1, v2, v1, v4 wait_exp:7
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_dual_add_f32 v4, s16, v0 :: v_dual_add_f32 v5, s17, v1
|
||||
; GFX11-NEXT: v_dual_add_f32 v12, s20, v0 :: v_dual_add_f32 v13, s21, v1
|
||||
; GFX11-NEXT: v_dual_add_f32 v8, s18, v0 :: v_dual_add_f32 v9, s19, v1
|
||||
; GFX11-NEXT: v_dual_add_f32 v16, s22, v0 :: v_dual_add_f32 v17, s23, v1
|
||||
; GFX11-NEXT: s_clause 0x3
|
||||
; GFX11-NEXT: image_sample v[4:7], v[4:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX11-NEXT: image_sample v[8:11], v[8:9], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX11-NEXT: image_sample v[12:15], v[12:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX11-NEXT: image_sample v[16:19], v[16:17], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX11-NEXT: s_clause 0x3
|
||||
; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0x50
|
||||
; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x60
|
||||
; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0x70
|
||||
; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0x80
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_dual_add_f32 v20, s16, v0 :: v_dual_add_f32 v21, s17, v1
|
||||
; GFX11-NEXT: v_dual_add_f32 v28, s20, v0 :: v_dual_add_f32 v29, s21, v1
|
||||
; GFX11-NEXT: v_dual_add_f32 v24, s18, v0 :: v_dual_add_f32 v25, s19, v1
|
||||
; GFX11-NEXT: v_dual_add_f32 v32, s22, v0 :: v_dual_add_f32 v33, s23, v1
|
||||
; GFX11-NEXT: s_clause 0x3
|
||||
; GFX11-NEXT: image_sample v[20:23], v[20:21], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX11-NEXT: image_sample v[24:27], v[24:25], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX11-NEXT: image_sample v[28:31], v[28:29], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX11-NEXT: image_sample v[32:35], v[32:33], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX11-NEXT: s_clause 0x3
|
||||
; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0x90
|
||||
; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0xa0
|
||||
; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0xb0
|
||||
; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0xc0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_dual_add_f32 v36, s16, v0 :: v_dual_add_f32 v37, s17, v1
|
||||
; GFX11-NEXT: v_dual_add_f32 v44, s20, v0 :: v_dual_add_f32 v45, s21, v1
|
||||
; GFX11-NEXT: v_dual_add_f32 v40, s18, v0 :: v_dual_add_f32 v41, s19, v1
|
||||
; GFX11-NEXT: v_dual_add_f32 v48, s22, v0 :: v_dual_add_f32 v49, s23, v1
|
||||
; GFX11-NEXT: s_clause 0x3
|
||||
; GFX11-NEXT: image_sample v[36:39], v[36:37], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX11-NEXT: image_sample v[40:43], v[40:41], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX11-NEXT: image_sample v[44:47], v[44:45], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX11-NEXT: image_sample v[48:51], v[48:49], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX11-NEXT: s_clause 0x3
|
||||
; GFX11-NEXT: s_buffer_load_b64 s[16:17], s[12:15], 0xd0
|
||||
; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0xe0
|
||||
; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0xf0
|
||||
; GFX11-NEXT: s_buffer_load_b64 s[12:13], s[12:15], 0x100
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_dual_add_f32 v52, s16, v0 :: v_dual_add_f32 v53, s17, v1
|
||||
; GFX11-NEXT: v_dual_add_f32 v56, s18, v0 :: v_dual_add_f32 v57, s19, v1
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: image_sample v[52:55], v[52:53], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX11-NEXT: image_sample v[56:59], v[56:57], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX11-NEXT: v_dual_add_f32 v60, s20, v0 :: v_dual_add_f32 v61, s21, v1
|
||||
; GFX11-NEXT: v_dual_add_f32 v0, s12, v0 :: v_dual_add_f32 v1, s13, v1
|
||||
; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s24
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: image_sample v[60:63], v[60:61], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX11-NEXT: image_sample v[64:67], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(14)
|
||||
; GFX11-NEXT: v_dual_add_f32 v0, v8, v4 :: v_dual_add_f32 v1, v9, v5
|
||||
; GFX11-NEXT: v_dual_add_f32 v4, v10, v6 :: v_dual_add_f32 v5, v11, v7
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(13)
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_dual_add_f32 v0, v12, v0 :: v_dual_add_f32 v1, v13, v1
|
||||
; GFX11-NEXT: v_dual_add_f32 v4, v14, v4 :: v_dual_add_f32 v5, v15, v5
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(12)
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_dual_add_f32 v0, v16, v0 :: v_dual_add_f32 v1, v17, v1
|
||||
; GFX11-NEXT: v_dual_add_f32 v4, v18, v4 :: v_dual_add_f32 v5, v19, v5
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(11)
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_dual_add_f32 v0, v20, v0 :: v_dual_add_f32 v1, v21, v1
|
||||
; GFX11-NEXT: v_dual_add_f32 v4, v22, v4 :: v_dual_add_f32 v5, v23, v5
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(10)
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_dual_add_f32 v0, v24, v0 :: v_dual_add_f32 v1, v25, v1
|
||||
; GFX11-NEXT: v_dual_add_f32 v4, v26, v4 :: v_dual_add_f32 v5, v27, v5
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(9)
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_dual_add_f32 v0, v28, v0 :: v_dual_add_f32 v1, v29, v1
|
||||
; GFX11-NEXT: v_dual_add_f32 v4, v30, v4 :: v_dual_add_f32 v5, v31, v5
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(8)
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_dual_add_f32 v0, v32, v0 :: v_dual_add_f32 v1, v33, v1
|
||||
; GFX11-NEXT: v_dual_add_f32 v4, v34, v4 :: v_dual_add_f32 v5, v35, v5
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(7)
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_dual_add_f32 v0, v36, v0 :: v_dual_add_f32 v1, v37, v1
|
||||
; GFX11-NEXT: v_dual_add_f32 v4, v38, v4 :: v_dual_add_f32 v5, v39, v5
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(6)
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_dual_add_f32 v0, v40, v0 :: v_dual_add_f32 v1, v41, v1
|
||||
; GFX11-NEXT: v_dual_add_f32 v4, v42, v4 :: v_dual_add_f32 v5, v43, v5
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(5)
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_dual_add_f32 v0, v44, v0 :: v_dual_add_f32 v1, v45, v1
|
||||
; GFX11-NEXT: v_dual_add_f32 v4, v46, v4 :: v_dual_add_f32 v5, v47, v5
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(4)
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_dual_add_f32 v0, v48, v0 :: v_dual_add_f32 v1, v49, v1
|
||||
; GFX11-NEXT: v_dual_add_f32 v4, v50, v4 :: v_dual_add_f32 v5, v51, v5
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(3)
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_dual_add_f32 v0, v52, v0 :: v_dual_add_f32 v1, v53, v1
|
||||
; GFX11-NEXT: v_dual_add_f32 v4, v54, v4 :: v_dual_add_f32 v5, v55, v5
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(2)
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_dual_add_f32 v0, v56, v0 :: v_dual_add_f32 v1, v57, v1
|
||||
; GFX11-NEXT: v_dual_add_f32 v4, v58, v4 :: v_dual_add_f32 v5, v59, v5
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_dual_add_f32 v0, v60, v0 :: v_dual_add_f32 v1, v61, v1
|
||||
; GFX11-NEXT: v_dual_add_f32 v4, v62, v4 :: v_dual_add_f32 v5, v63, v5
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_dual_add_f32 v0, v64, v0 :: v_dual_add_f32 v1, v65, v1
|
||||
; GFX11-NEXT: v_dual_add_f32 v4, v66, v4 :: v_dual_add_f32 v5, v67, v5
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
||||
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v0, v0, v1
|
||||
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v1, v4, v5
|
||||
; GFX11-NEXT: exp mrt0 v0, v1, off, off done
|
||||
; GFX11-NEXT: s_endpgm
|
||||
.entry:
|
||||
%i = call i64 @llvm.amdgcn.s.getpc()
|
||||
%i1 = and i64 %i, -4294967296
|
||||
%i2 = zext i32 %userdata6 to i64
|
||||
%i3 = or disjoint i64 %i1, %i2
|
||||
%i4 = inttoptr i64 %i3 to ptr addrspace(4)
|
||||
%i5 = load <4 x i32>, ptr addrspace(4) %i4, align 16
|
||||
%i6 = zext i32 %userdata7 to i64
|
||||
%i7 = or disjoint i64 %i1, %i6
|
||||
%i8 = inttoptr i64 %i7 to ptr addrspace(4)
|
||||
%i9 = load <4 x i32>, ptr addrspace(4) %i8, align 4, !invariant.load !0
|
||||
%i10 = zext i32 %userdata8 to i64
|
||||
%i11 = or disjoint i64 %i1, %i10
|
||||
%i12 = inttoptr i64 %i11 to ptr addrspace(4)
|
||||
%i13 = load <8 x i32>, ptr addrspace(4) %i12, align 4, !invariant.load !0
|
||||
%i14 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %PrimMask)
|
||||
%PerspInterpCenter.i1 = extractelement <2 x float> %PerspInterpCenter, i64 1
|
||||
%PerspInterpCenter.i0 = extractelement <2 x float> %PerspInterpCenter, i64 0
|
||||
%i15 = call float @llvm.amdgcn.interp.inreg.p10(float %i14, float %PerspInterpCenter.i0, float %i14)
|
||||
%i16 = call float @llvm.amdgcn.interp.inreg.p2(float %i14, float %PerspInterpCenter.i1, float %i15)
|
||||
%i17 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %PrimMask)
|
||||
%i18 = call float @llvm.amdgcn.interp.inreg.p10(float %i17, float %PerspInterpCenter.i0, float %i17)
|
||||
%i19 = call float @llvm.amdgcn.interp.inreg.p2(float %i17, float %PerspInterpCenter.i1, float %i18)
|
||||
%i20 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 16, i32 0), !invariant.load !0
|
||||
%i21 = shufflevector <2 x i32> %i20, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
|
||||
%i22 = bitcast <4 x i32> %i21 to <4 x float>
|
||||
%.i0 = extractelement <4 x float> %i22, i64 0
|
||||
%.i1 = extractelement <4 x float> %i22, i64 1
|
||||
%.i03 = fadd reassoc nnan nsz arcp contract afn float %.i0, %i19
|
||||
%.i14 = fadd reassoc nnan nsz arcp contract afn float %.i1, %i16
|
||||
%i23 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i03, float %.i14, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
|
||||
%.i010 = extractelement <4 x float> %i23, i64 0
|
||||
%.i113 = extractelement <4 x float> %i23, i64 1
|
||||
%.i215 = extractelement <4 x float> %i23, i64 2
|
||||
%.i317 = extractelement <4 x float> %i23, i64 3
|
||||
%i24 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 32, i32 0), !invariant.load !0
|
||||
%i25 = shufflevector <2 x i32> %i24, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
|
||||
%i26 = bitcast <4 x i32> %i25 to <4 x float>
|
||||
%.i05 = extractelement <4 x float> %i26, i64 0
|
||||
%.i16 = extractelement <4 x float> %i26, i64 1
|
||||
%.i07 = fadd reassoc nnan nsz arcp contract afn float %.i05, %i19
|
||||
%.i18 = fadd reassoc nnan nsz arcp contract afn float %.i16, %i16
|
||||
%i27 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i07, float %.i18, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
|
||||
%.i09 = extractelement <4 x float> %i27, i64 0
|
||||
%.i011 = fadd reassoc nnan nsz arcp contract afn float %.i09, %.i010
|
||||
%.i112 = extractelement <4 x float> %i27, i64 1
|
||||
%.i114 = fadd reassoc nnan nsz arcp contract afn float %.i112, %.i113
|
||||
%.i2 = extractelement <4 x float> %i27, i64 2
|
||||
%.i216 = fadd reassoc nnan nsz arcp contract afn float %.i2, %.i215
|
||||
%.i3 = extractelement <4 x float> %i27, i64 3
|
||||
%.i318 = fadd reassoc nnan nsz arcp contract afn float %.i3, %.i317
|
||||
%i28 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 48, i32 0), !invariant.load !0
|
||||
%i29 = shufflevector <2 x i32> %i28, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
|
||||
%i30 = bitcast <4 x i32> %i29 to <4 x float>
|
||||
%.i019 = extractelement <4 x float> %i30, i64 0
|
||||
%.i120 = extractelement <4 x float> %i30, i64 1
|
||||
%.i021 = fadd reassoc nnan nsz arcp contract afn float %.i019, %i19
|
||||
%.i122 = fadd reassoc nnan nsz arcp contract afn float %.i120, %i16
|
||||
%i31 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i021, float %.i122, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
|
||||
%.i023 = extractelement <4 x float> %i31, i64 0
|
||||
%.i024 = fadd reassoc nnan nsz arcp contract afn float %.i023, %.i011
|
||||
%.i125 = extractelement <4 x float> %i31, i64 1
|
||||
%.i126 = fadd reassoc nnan nsz arcp contract afn float %.i125, %.i114
|
||||
%.i227 = extractelement <4 x float> %i31, i64 2
|
||||
%.i228 = fadd reassoc nnan nsz arcp contract afn float %.i227, %.i216
|
||||
%.i329 = extractelement <4 x float> %i31, i64 3
|
||||
%.i330 = fadd reassoc nnan nsz arcp contract afn float %.i329, %.i318
|
||||
%i32 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 64, i32 0), !invariant.load !0
|
||||
%i33 = shufflevector <2 x i32> %i32, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
|
||||
%i34 = bitcast <4 x i32> %i33 to <4 x float>
|
||||
%.i031 = extractelement <4 x float> %i34, i64 0
|
||||
%.i132 = extractelement <4 x float> %i34, i64 1
|
||||
%.i033 = fadd reassoc nnan nsz arcp contract afn float %.i031, %i19
|
||||
%.i134 = fadd reassoc nnan nsz arcp contract afn float %.i132, %i16
|
||||
%i35 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i033, float %.i134, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
|
||||
%.i035 = extractelement <4 x float> %i35, i64 0
|
||||
%.i036 = fadd reassoc nnan nsz arcp contract afn float %.i035, %.i024
|
||||
%.i137 = extractelement <4 x float> %i35, i64 1
|
||||
%.i138 = fadd reassoc nnan nsz arcp contract afn float %.i137, %.i126
|
||||
%.i239 = extractelement <4 x float> %i35, i64 2
|
||||
%.i240 = fadd reassoc nnan nsz arcp contract afn float %.i239, %.i228
|
||||
%.i341 = extractelement <4 x float> %i35, i64 3
|
||||
%.i342 = fadd reassoc nnan nsz arcp contract afn float %.i341, %.i330
|
||||
%i36 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 80, i32 0), !invariant.load !0
|
||||
%i37 = shufflevector <2 x i32> %i36, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
|
||||
%i38 = bitcast <4 x i32> %i37 to <4 x float>
|
||||
%.i043 = extractelement <4 x float> %i38, i64 0
|
||||
%.i144 = extractelement <4 x float> %i38, i64 1
|
||||
%.i045 = fadd reassoc nnan nsz arcp contract afn float %.i043, %i19
|
||||
%.i146 = fadd reassoc nnan nsz arcp contract afn float %.i144, %i16
|
||||
%i39 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i045, float %.i146, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
|
||||
%.i047 = extractelement <4 x float> %i39, i64 0
|
||||
%.i048 = fadd reassoc nnan nsz arcp contract afn float %.i047, %.i036
|
||||
%.i149 = extractelement <4 x float> %i39, i64 1
|
||||
%.i150 = fadd reassoc nnan nsz arcp contract afn float %.i149, %.i138
|
||||
%.i251 = extractelement <4 x float> %i39, i64 2
|
||||
%.i252 = fadd reassoc nnan nsz arcp contract afn float %.i251, %.i240
|
||||
%.i353 = extractelement <4 x float> %i39, i64 3
|
||||
%.i354 = fadd reassoc nnan nsz arcp contract afn float %.i353, %.i342
|
||||
%i40 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 96, i32 0), !invariant.load !0
|
||||
%i41 = shufflevector <2 x i32> %i40, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
|
||||
%i42 = bitcast <4 x i32> %i41 to <4 x float>
|
||||
%.i055 = extractelement <4 x float> %i42, i64 0
|
||||
%.i156 = extractelement <4 x float> %i42, i64 1
|
||||
%.i057 = fadd reassoc nnan nsz arcp contract afn float %.i055, %i19
|
||||
%.i158 = fadd reassoc nnan nsz arcp contract afn float %.i156, %i16
|
||||
%i43 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i057, float %.i158, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
|
||||
%.i059 = extractelement <4 x float> %i43, i64 0
|
||||
%.i060 = fadd reassoc nnan nsz arcp contract afn float %.i059, %.i048
|
||||
%.i161 = extractelement <4 x float> %i43, i64 1
|
||||
%.i162 = fadd reassoc nnan nsz arcp contract afn float %.i161, %.i150
|
||||
%.i263 = extractelement <4 x float> %i43, i64 2
|
||||
%.i264 = fadd reassoc nnan nsz arcp contract afn float %.i263, %.i252
|
||||
%.i365 = extractelement <4 x float> %i43, i64 3
|
||||
%.i366 = fadd reassoc nnan nsz arcp contract afn float %.i365, %.i354
|
||||
%i44 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 112, i32 0), !invariant.load !0
|
||||
%i45 = shufflevector <2 x i32> %i44, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
|
||||
%i46 = bitcast <4 x i32> %i45 to <4 x float>
|
||||
%.i067 = extractelement <4 x float> %i46, i64 0
|
||||
%.i168 = extractelement <4 x float> %i46, i64 1
|
||||
%.i069 = fadd reassoc nnan nsz arcp contract afn float %.i067, %i19
|
||||
%.i170 = fadd reassoc nnan nsz arcp contract afn float %.i168, %i16
|
||||
%i47 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i069, float %.i170, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
|
||||
%.i071 = extractelement <4 x float> %i47, i64 0
|
||||
%.i072 = fadd reassoc nnan nsz arcp contract afn float %.i071, %.i060
|
||||
%.i173 = extractelement <4 x float> %i47, i64 1
|
||||
%.i174 = fadd reassoc nnan nsz arcp contract afn float %.i173, %.i162
|
||||
%.i275 = extractelement <4 x float> %i47, i64 2
|
||||
%.i276 = fadd reassoc nnan nsz arcp contract afn float %.i275, %.i264
|
||||
%.i377 = extractelement <4 x float> %i47, i64 3
|
||||
%.i378 = fadd reassoc nnan nsz arcp contract afn float %.i377, %.i366
|
||||
%i48 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 128, i32 0), !invariant.load !0
|
||||
%i49 = shufflevector <2 x i32> %i48, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
|
||||
%i50 = bitcast <4 x i32> %i49 to <4 x float>
|
||||
%.i079 = extractelement <4 x float> %i50, i64 0
|
||||
%.i180 = extractelement <4 x float> %i50, i64 1
|
||||
%.i081 = fadd reassoc nnan nsz arcp contract afn float %.i079, %i19
|
||||
%.i182 = fadd reassoc nnan nsz arcp contract afn float %.i180, %i16
|
||||
%i51 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i081, float %.i182, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
|
||||
%.i083 = extractelement <4 x float> %i51, i64 0
|
||||
%.i084 = fadd reassoc nnan nsz arcp contract afn float %.i083, %.i072
|
||||
%.i185 = extractelement <4 x float> %i51, i64 1
|
||||
%.i186 = fadd reassoc nnan nsz arcp contract afn float %.i185, %.i174
|
||||
%.i287 = extractelement <4 x float> %i51, i64 2
|
||||
%.i288 = fadd reassoc nnan nsz arcp contract afn float %.i287, %.i276
|
||||
%.i389 = extractelement <4 x float> %i51, i64 3
|
||||
%.i390 = fadd reassoc nnan nsz arcp contract afn float %.i389, %.i378
|
||||
%i52 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 144, i32 0), !invariant.load !0
|
||||
%i53 = shufflevector <2 x i32> %i52, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
|
||||
%i54 = bitcast <4 x i32> %i53 to <4 x float>
|
||||
%.i091 = extractelement <4 x float> %i54, i64 0
|
||||
%.i192 = extractelement <4 x float> %i54, i64 1
|
||||
%.i093 = fadd reassoc nnan nsz arcp contract afn float %.i091, %i19
|
||||
%.i194 = fadd reassoc nnan nsz arcp contract afn float %.i192, %i16
|
||||
%i55 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i093, float %.i194, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
|
||||
%.i095 = extractelement <4 x float> %i55, i64 0
|
||||
%.i096 = fadd reassoc nnan nsz arcp contract afn float %.i095, %.i084
|
||||
%.i197 = extractelement <4 x float> %i55, i64 1
|
||||
%.i198 = fadd reassoc nnan nsz arcp contract afn float %.i197, %.i186
|
||||
%.i299 = extractelement <4 x float> %i55, i64 2
|
||||
%.i2100 = fadd reassoc nnan nsz arcp contract afn float %.i299, %.i288
|
||||
%.i3101 = extractelement <4 x float> %i55, i64 3
|
||||
%.i3102 = fadd reassoc nnan nsz arcp contract afn float %.i3101, %.i390
|
||||
%i56 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 160, i32 0), !invariant.load !0
|
||||
%i57 = shufflevector <2 x i32> %i56, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
|
||||
%i58 = bitcast <4 x i32> %i57 to <4 x float>
|
||||
%.i0103 = extractelement <4 x float> %i58, i64 0
|
||||
%.i1104 = extractelement <4 x float> %i58, i64 1
|
||||
%.i0105 = fadd reassoc nnan nsz arcp contract afn float %.i0103, %i19
|
||||
%.i1106 = fadd reassoc nnan nsz arcp contract afn float %.i1104, %i16
|
||||
%i59 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0105, float %.i1106, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
|
||||
%.i0107 = extractelement <4 x float> %i59, i64 0
|
||||
%.i0108 = fadd reassoc nnan nsz arcp contract afn float %.i0107, %.i096
|
||||
%.i1109 = extractelement <4 x float> %i59, i64 1
|
||||
%.i1110 = fadd reassoc nnan nsz arcp contract afn float %.i1109, %.i198
|
||||
%.i2111 = extractelement <4 x float> %i59, i64 2
|
||||
%.i2112 = fadd reassoc nnan nsz arcp contract afn float %.i2111, %.i2100
|
||||
%.i3113 = extractelement <4 x float> %i59, i64 3
|
||||
%.i3114 = fadd reassoc nnan nsz arcp contract afn float %.i3113, %.i3102
|
||||
%i60 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 176, i32 0), !invariant.load !0
|
||||
%i61 = shufflevector <2 x i32> %i60, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
|
||||
%i62 = bitcast <4 x i32> %i61 to <4 x float>
|
||||
%.i0115 = extractelement <4 x float> %i62, i64 0
|
||||
%.i1116 = extractelement <4 x float> %i62, i64 1
|
||||
%.i0117 = fadd reassoc nnan nsz arcp contract afn float %.i0115, %i19
|
||||
%.i1118 = fadd reassoc nnan nsz arcp contract afn float %.i1116, %i16
|
||||
%i63 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0117, float %.i1118, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
|
||||
%.i0119 = extractelement <4 x float> %i63, i64 0
|
||||
%.i0120 = fadd reassoc nnan nsz arcp contract afn float %.i0119, %.i0108
|
||||
%.i1121 = extractelement <4 x float> %i63, i64 1
|
||||
%.i1122 = fadd reassoc nnan nsz arcp contract afn float %.i1121, %.i1110
|
||||
%.i2123 = extractelement <4 x float> %i63, i64 2
|
||||
%.i2124 = fadd reassoc nnan nsz arcp contract afn float %.i2123, %.i2112
|
||||
%.i3125 = extractelement <4 x float> %i63, i64 3
|
||||
%.i3126 = fadd reassoc nnan nsz arcp contract afn float %.i3125, %.i3114
|
||||
%i64 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 192, i32 0), !invariant.load !0
|
||||
%i65 = shufflevector <2 x i32> %i64, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
|
||||
%i66 = bitcast <4 x i32> %i65 to <4 x float>
|
||||
%.i0127 = extractelement <4 x float> %i66, i64 0
|
||||
%.i1128 = extractelement <4 x float> %i66, i64 1
|
||||
%.i0129 = fadd reassoc nnan nsz arcp contract afn float %.i0127, %i19
|
||||
%.i1130 = fadd reassoc nnan nsz arcp contract afn float %.i1128, %i16
|
||||
%i67 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0129, float %.i1130, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
|
||||
%.i0131 = extractelement <4 x float> %i67, i64 0
|
||||
%.i0132 = fadd reassoc nnan nsz arcp contract afn float %.i0131, %.i0120
|
||||
%.i1133 = extractelement <4 x float> %i67, i64 1
|
||||
%.i1134 = fadd reassoc nnan nsz arcp contract afn float %.i1133, %.i1122
|
||||
%.i2135 = extractelement <4 x float> %i67, i64 2
|
||||
%.i2136 = fadd reassoc nnan nsz arcp contract afn float %.i2135, %.i2124
|
||||
%.i3137 = extractelement <4 x float> %i67, i64 3
|
||||
%.i3138 = fadd reassoc nnan nsz arcp contract afn float %.i3137, %.i3126
|
||||
%i68 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 208, i32 0), !invariant.load !0
|
||||
%i69 = shufflevector <2 x i32> %i68, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
|
||||
%i70 = bitcast <4 x i32> %i69 to <4 x float>
|
||||
%.i0139 = extractelement <4 x float> %i70, i64 0
|
||||
%.i1140 = extractelement <4 x float> %i70, i64 1
|
||||
%.i0141 = fadd reassoc nnan nsz arcp contract afn float %.i0139, %i19
|
||||
%.i1142 = fadd reassoc nnan nsz arcp contract afn float %.i1140, %i16
|
||||
%i71 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0141, float %.i1142, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
|
||||
%.i0143 = extractelement <4 x float> %i71, i64 0
|
||||
%.i0144 = fadd reassoc nnan nsz arcp contract afn float %.i0143, %.i0132
|
||||
%.i1145 = extractelement <4 x float> %i71, i64 1
|
||||
%.i1146 = fadd reassoc nnan nsz arcp contract afn float %.i1145, %.i1134
|
||||
%.i2147 = extractelement <4 x float> %i71, i64 2
|
||||
%.i2148 = fadd reassoc nnan nsz arcp contract afn float %.i2147, %.i2136
|
||||
%.i3149 = extractelement <4 x float> %i71, i64 3
|
||||
%.i3150 = fadd reassoc nnan nsz arcp contract afn float %.i3149, %.i3138
|
||||
%i72 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 224, i32 0), !invariant.load !0
|
||||
%i73 = shufflevector <2 x i32> %i72, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
|
||||
%i74 = bitcast <4 x i32> %i73 to <4 x float>
|
||||
%.i0151 = extractelement <4 x float> %i74, i64 0
|
||||
%.i1152 = extractelement <4 x float> %i74, i64 1
|
||||
%.i0153 = fadd reassoc nnan nsz arcp contract afn float %.i0151, %i19
|
||||
%.i1154 = fadd reassoc nnan nsz arcp contract afn float %.i1152, %i16
|
||||
%i75 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0153, float %.i1154, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
|
||||
%.i0155 = extractelement <4 x float> %i75, i64 0
|
||||
%.i0156 = fadd reassoc nnan nsz arcp contract afn float %.i0155, %.i0144
|
||||
%.i1157 = extractelement <4 x float> %i75, i64 1
|
||||
%.i1158 = fadd reassoc nnan nsz arcp contract afn float %.i1157, %.i1146
|
||||
%.i2159 = extractelement <4 x float> %i75, i64 2
|
||||
%.i2160 = fadd reassoc nnan nsz arcp contract afn float %.i2159, %.i2148
|
||||
%.i3161 = extractelement <4 x float> %i75, i64 3
|
||||
%.i3162 = fadd reassoc nnan nsz arcp contract afn float %.i3161, %.i3150
|
||||
%i76 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 240, i32 0), !invariant.load !0
|
||||
%i77 = shufflevector <2 x i32> %i76, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
|
||||
%i78 = bitcast <4 x i32> %i77 to <4 x float>
|
||||
%.i0163 = extractelement <4 x float> %i78, i64 0
|
||||
%.i1164 = extractelement <4 x float> %i78, i64 1
|
||||
%.i0165 = fadd reassoc nnan nsz arcp contract afn float %.i0163, %i19
|
||||
%.i1166 = fadd reassoc nnan nsz arcp contract afn float %.i1164, %i16
|
||||
%i79 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0165, float %.i1166, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
|
||||
%.i0167 = extractelement <4 x float> %i79, i64 0
|
||||
%.i0168 = fadd reassoc nnan nsz arcp contract afn float %.i0167, %.i0156
|
||||
%.i1169 = extractelement <4 x float> %i79, i64 1
|
||||
%.i1170 = fadd reassoc nnan nsz arcp contract afn float %.i1169, %.i1158
|
||||
%.i2171 = extractelement <4 x float> %i79, i64 2
|
||||
%.i2172 = fadd reassoc nnan nsz arcp contract afn float %.i2171, %.i2160
|
||||
%.i3173 = extractelement <4 x float> %i79, i64 3
|
||||
%.i3174 = fadd reassoc nnan nsz arcp contract afn float %.i3173, %.i3162
|
||||
%i80 = call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %i5, i32 256, i32 0), !invariant.load !0
|
||||
%i81 = shufflevector <2 x i32> %i80, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
|
||||
%i82 = bitcast <4 x i32> %i81 to <4 x float>
|
||||
%.i0175 = extractelement <4 x float> %i82, i64 0
|
||||
%.i1176 = extractelement <4 x float> %i82, i64 1
|
||||
%.i0177 = fadd reassoc nnan nsz arcp contract afn float %.i0175, %i19
|
||||
%.i1178 = fadd reassoc nnan nsz arcp contract afn float %.i1176, %i16
|
||||
%i83 = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32(i32 15, float %.i0177, float %.i1178, <8 x i32> %i13, <4 x i32> %i9, i1 false, i32 0, i32 0)
|
||||
%.i0179 = extractelement <4 x float> %i83, i64 0
|
||||
%.i0180 = fadd reassoc nnan nsz arcp contract afn float %.i0179, %.i0168
|
||||
%.i1181 = extractelement <4 x float> %i83, i64 1
|
||||
%.i1182 = fadd reassoc nnan nsz arcp contract afn float %.i1181, %.i1170
|
||||
%.i2183 = extractelement <4 x float> %i83, i64 2
|
||||
%.i2184 = fadd reassoc nnan nsz arcp contract afn float %.i2183, %.i2172
|
||||
%.i3185 = extractelement <4 x float> %i83, i64 3
|
||||
%.i3186 = fadd reassoc nnan nsz arcp contract afn float %.i3185, %.i3174
|
||||
%i84 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %.i0180, float %.i1182)
|
||||
%i85 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %.i2184, float %.i3186)
|
||||
%i86 = bitcast <2 x half> %i84 to float
|
||||
%i87 = bitcast <2 x half> %i85 to float
|
||||
call void @llvm.amdgcn.exp.f32(i32 0, i32 3, float %i86, float %i87, float poison, float poison, i1 true, i1 true)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Declarations of the AMDGPU intrinsics for this test. The trailing #N on each
; declaration refers to an attribute group defined further down in the file.
; NOTE(review): s.getpc, lds.param.load and interp.inreg.p10/p2 are not called
; in the part of the function body visible here; presumably they are used
; earlier in the function -- verify.
declare noundef i64 @llvm.amdgcn.s.getpc() #3
|
||||
; NOTE(review): the calls in the function body above use the name
; @llvm.amdgcn.image.sample.2d.v4f32.f32.v8i32.v4i32, which carries an extra
; .v8i32.v4i32 suffix compared with this declaration -- confirm the two are
; meant to match.
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 immarg, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #5
|
||||
; Called at the end of the function body to pack the four accumulated floats
; into two <2 x half> values.
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3
|
||||
; Called once, just before `ret void`, with the two packed values.
declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) #4
|
||||
declare float @llvm.amdgcn.lds.param.load(i32 immarg, i32 immarg, i32) #3
|
||||
declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #3
|
||||
declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #3
|
||||
; Each call in the function body loads a <2 x i32> at a constant offset and is
; tagged with !invariant.load.
declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32 immarg) #8
|
||||
|
||||
; Attribute group #2 carries the "amdgpu-sched-strategy"="max-memory-clause"
; string attribute, i.e. it selects the max-memory-clause scheduling strategy
; through a function attribute rather than the command line option.
; NOTE(review): presumably attached to the test function defined above; its
; `define` line is not visible here -- confirm.
attributes #2 = { alwaysinline nounwind memory(readwrite) "amdgpu-sched-strategy"="max-memory-clause"}
|
||||
; Groups #3, #4, #5 and #8 are the ones referenced by the intrinsic
; declarations in this file.
attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
|
||||
attributes #4 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
|
||||
attributes #5 = { nocallback nofree nosync nounwind willreturn memory(read) }
|
||||
attributes #8 = { nocallback nofree nosync nounwind willreturn memory(none) }
|
||||
|
||||
; Empty metadata node; used as the operand of !invariant.load on the
; s.buffer.load calls in the function body.
!0 = !{}
|
@ -1,5 +1,5 @@
|
||||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
|
||||
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-enable-max-ilp-scheduling-strategy -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s
|
||||
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -amdgpu-sched-strategy=max-ilp -verify-machineinstrs -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck %s
|
||||
|
||||
---
|
||||
name: max-ilp-liveness-tracking
|
||||
|
@ -1,6 +1,6 @@
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=tonga -misched=gcn-iterative-ilp -verify-machineinstrs < %s | FileCheck %s
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=tonga -misched=gcn-max-ilp -verify-machineinstrs < %s | FileCheck %s
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-enable-max-ilp-scheduling-strategy -verify-machineinstrs < %s | FileCheck %s
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-sched-strategy=max-ilp -verify-machineinstrs < %s | FileCheck %s
|
||||
|
||||
; CHECK: NumVgprs: {{[0-9][0-9][0-9]$}}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user