[VPlan] Materialize Build(Struct)Vectors for VPReplicateRecipes. (NFCI) (#151487)

Materialize Build(Struct)Vectors explicitly for VPReplicateRecipes, to
serve their users requiring a vector, instead of doing so when unrolling
by VF.

Now we only need to implicitly build vectors in VPTransformState::get
for VPInstructions. Once they are also unrolled by VF we can remove the
code-path altogether.

PR: https://github.com/llvm/llvm-project/pull/151487
This commit is contained in:
Florian Hahn 2025-08-18 20:49:42 +01:00 committed by GitHub
parent f5a648f919
commit 7e9989390d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 101 additions and 18 deletions

View File

@ -7254,8 +7254,9 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// TODO: Move to VPlan transform stage once the transition to the VPlan-based
// cost model is complete for better cost estimates.
VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF);
VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
VPlanTransforms::runPass(VPlanTransforms::materializeBuildVectors, BestVPlan);
VPlanTransforms::runPass(VPlanTransforms::materializeBroadcasts, BestVPlan);
VPlanTransforms::runPass(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
bool HasBranchWeights =
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator());
if (HasBranchWeights) {

View File

@ -355,6 +355,9 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) {
set(Def, VectorValue);
} else {
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
// Only VPInstructions may still reach this implicit packing path. The adjacent
// string literals are concatenated by the compiler, so the first one must end
// with a space to keep the message readable.
assert(isa<VPInstruction>(Def) &&
       "Explicit BuildVector recipes must have "
       "handled packing for non-VPInstructions.");
// Initialize packing with insertelements to start from poison.
VectorValue = PoisonValue::get(toVectorizedTy(LastInst->getType(), VF));
for (unsigned Lane = 0; Lane < VF.getFixedValue(); ++Lane)

View File

@ -460,6 +460,8 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
case Instruction::Load:
case VPInstruction::AnyOf:
case VPInstruction::BranchOnCond:
case VPInstruction::BuildStructVector:
case VPInstruction::BuildVector:
case VPInstruction::CalculateTripCountMinusVF:
case VPInstruction::CanonicalIVIncrementForPart:
case VPInstruction::ExplicitVectorLength:

View File

@ -3282,6 +3282,52 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
BTC->replaceAllUsesWith(TCMO);
}
/// Insert an explicit BuildVector (or BuildStructVector, for struct-typed
/// results) VPInstruction after each replicating VPReplicateRecipe that has at
/// least one user which either does not use its scalars or whose enclosing
/// region is not the vector loop region, and redirect exactly those users to
/// the new recipe. Does nothing if \p Plan only has a scalar VF.
void VPlanTransforms::materializeBuildVectors(VPlan &Plan) {
  // Plans that only have a scalar VF never need to pack scalars into vectors.
  if (Plan.hasScalarVFOnly())
    return;

  VPTypeAnalysis TypeInfo(Plan);
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  auto BlocksOutsideLoop = VPBlockUtils::blocksOnly<VPBasicBlock>(
      vp_depth_first_shallow(Plan.getEntry()));
  auto BlocksInsideLoop = VPBlockUtils::blocksOnly<VPBasicBlock>(
      vp_depth_first_shallow(LoopRegion->getEntry()));

  // Only replicating VPReplicateRecipes in the blocks collected above are
  // processed; the shallow traversals do not descend into nested replicate
  // regions. Recipes inside replicate regions are not materialized explicitly
  // yet: their vector users are still handled in VPReplicateRegion::execute(),
  // via shouldPack().
  // TODO: materialize build vectors for replicating recipes in replicating
  // regions.
  // TODO: materialize build vectors for VPInstructions.
  for (VPBasicBlock *VPBB :
       concat<VPBasicBlock *>(BlocksOutsideLoop, BlocksInsideLoop)) {
    // Early-inc iteration, as a new recipe is inserted right after the one
    // currently being visited.
    for (VPRecipeBase &Recipe : make_early_inc_range(*VPBB)) {
      auto *RepR = dyn_cast<VPReplicateRecipe>(&Recipe);
      if (!RepR || RepR->isSingleScalar())
        continue;

      // True for users that must be served by the packed vector: either they
      // do not use the scalars of RepR, or their enclosing region differs from
      // the vector loop region.
      auto NeedsPackedValue = [RepR, LoopRegion](VPUser *User) {
        VPRegionBlock *UserRegion =
            cast<VPRecipeBase>(User)->getParent()->getParent();
        if (!User->usesScalars(RepR))
          return true;
        return UserRegion != LoopRegion;
      };
      if (none_of(RepR->users(), NeedsPackedValue))
        continue;

      // Struct-typed results need the struct flavour of the packing recipe.
      unsigned Opcode = VPInstruction::BuildVector;
      if (TypeInfo.inferScalarType(RepR)->isStructTy())
        Opcode = VPInstruction::BuildStructVector;

      auto *Packed = new VPInstruction(Opcode, {RepR});
      Packed->insertAfter(RepR);

      // Redirect the qualifying users to the packed value; the new recipe
      // itself must keep RepR as its operand.
      RepR->replaceUsesWithIf(Packed, [&](VPUser &User, unsigned) {
        return &User != Packed && NeedsPackedValue(&User);
      });
    }
  }
}
void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
VPBasicBlock *VectorPHVPBB,
bool TailByMasking,

View File

@ -274,6 +274,10 @@ struct VPlanTransforms {
static void materializeBackedgeTakenCount(VPlan &Plan,
VPBasicBlock *VectorPH);
/// Add explicit Build[Struct]Vector recipes that combine multiple scalar
/// values into single vectors.
static void materializeBuildVectors(VPlan &Plan);
/// Materialize VF and VFxUF to be computed explicitly using VPInstructions.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
ElementCount VF);

View File

@ -464,10 +464,12 @@ void VPlanTransforms::unrollByUF(VPlan &Plan, unsigned UF) {
VPlanTransforms::removeDeadRecipes(Plan);
}
/// Create a single-scalar clone of \p RepR for lane \p Lane.
static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder,
Type *IdxTy, VPReplicateRecipe *RepR,
VPLane Lane) {
/// Create a single-scalar clone of \p RepR for lane \p Lane. Use \p
/// Def2LaneDefs to look up scalar definitions for operands of \p RepR.
static VPReplicateRecipe *
cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
VPReplicateRecipe *RepR, VPLane Lane,
const DenseMap<VPValue *, SmallVector<VPValue *>> &Def2LaneDefs) {
// Collect the operands at Lane, creating extracts as needed.
SmallVector<VPValue *> NewOps;
for (VPValue *Op : RepR->operands()) {
@ -480,6 +482,14 @@ static VPReplicateRecipe *cloneForLane(VPlan &Plan, VPBuilder &Builder,
Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));
continue;
}
// If Op is a definition that has been unrolled, directly use the clone for
// the corresponding lane.
auto LaneDefs = Def2LaneDefs.find(Op);
if (LaneDefs != Def2LaneDefs.end()) {
NewOps.push_back(LaneDefs->second[Lane.getKnownLane()]);
continue;
}
// Look through buildvector to avoid unnecessary extracts.
if (match(Op, m_BuildVector())) {
NewOps.push_back(
@ -512,6 +522,13 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()));
auto VPBBsToUnroll =
concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion);
// A mapping of current VPValue definitions to collections of new VPValues
// defined per lane. Serves to hook-up potential users of current VPValue
// definition that are replicated-per-VF later.
DenseMap<VPValue *, SmallVector<VPValue *>> Def2LaneDefs;
// The removal of current recipes being replaced by new ones needs to be
// delayed after Def2LaneDefs is no longer in use.
SmallVector<VPRecipeBase *> ToRemove;
for (VPBasicBlock *VPBB : VPBBsToUnroll) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
@ -523,12 +540,12 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
if (isa<StoreInst>(RepR->getUnderlyingInstr()) &&
vputils::isSingleScalar(RepR->getOperand(1))) {
// Stores to invariant addresses need to store the last lane only.
cloneForLane(Plan, Builder, IdxTy, RepR,
VPLane::getLastLaneForVF(VF));
cloneForLane(Plan, Builder, IdxTy, RepR, VPLane::getLastLaneForVF(VF),
Def2LaneDefs);
} else {
// Create single-scalar version of RepR for all lanes.
for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I));
cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs);
}
RepR->eraseFromParent();
continue;
@ -536,23 +553,33 @@ void VPlanTransforms::replicateByVF(VPlan &Plan, ElementCount VF) {
/// Create single-scalar version of RepR for all lanes.
SmallVector<VPValue *> LaneDefs;
for (unsigned I = 0; I != VF.getKnownMinValue(); ++I)
LaneDefs.push_back(cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I)));
LaneDefs.push_back(
cloneForLane(Plan, Builder, IdxTy, RepR, VPLane(I), Def2LaneDefs));
Def2LaneDefs[RepR] = LaneDefs;
/// Users that only demand the first lane can use the definition for lane
/// 0.
RepR->replaceUsesWithIf(LaneDefs[0], [RepR](VPUser &U, unsigned) {
return U.onlyFirstLaneUsed(RepR);
});
// If needed, create a Build(Struct)Vector recipe to insert the scalar
// lane values into a vector.
Type *ResTy = RepR->getUnderlyingInstr()->getType();
VPValue *VecRes = Builder.createNaryOp(
ResTy->isStructTy() ? VPInstruction::BuildStructVector
: VPInstruction::BuildVector,
LaneDefs);
RepR->replaceAllUsesWith(VecRes);
RepR->eraseFromParent();
// Update each build vector user that currently has RepR as its only
// operand, to have all LaneDefs as its operands.
for (VPUser *U : to_vector(RepR->users())) {
auto *VPI = dyn_cast<VPInstruction>(U);
if (!VPI || (VPI->getOpcode() != VPInstruction::BuildVector &&
VPI->getOpcode() != VPInstruction::BuildStructVector))
continue;
assert(VPI->getNumOperands() == 1 &&
"Build(Struct)Vector must have a single operand before "
"replicating by VF");
VPI->setOperand(0, LaneDefs[0]);
for (VPValue *LaneDef : drop_begin(LaneDefs))
VPI->addOperand(LaneDef);
}
ToRemove.push_back(RepR);
}
}
for (auto *R : reverse(ToRemove))
R->eraseFromParent();
}