[VPlan] Check Def2LaneDefs first in cloneForLane. (NFC)

If we have entries in Def2LaneDefs, we always have to use it. Move the check before. Otherwise we may not pick the correct operand, e.g. if Op was a replicate recipe that got single-scalar after replicating it. Fixes https://github.com/llvm/llvm-project/issues/154330.
2025-08-21 11:34:49 +01:00 · 2025-08-21 11:34:49 +01:00 · d67dba5e88
commit d67dba5e88
parent b20c291bae
2 changed files with 103 additions and 16 deletions
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@ -473,8 +473,11 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
  // Collect the operands at Lane, creating extracts as needed.
  SmallVector<VPValue *> NewOps;
  for (VPValue *Op : RepR->operands()) {
-    if (vputils::isSingleScalar(Op)) {
+    // If Op is a definition that has been unrolled, directly use the clone for
-      NewOps.push_back(Op);
+    // the corresponding lane.
    auto LaneDefs = Def2LaneDefs.find(Op);
    if (LaneDefs != Def2LaneDefs.end()) {
      NewOps.push_back(LaneDefs->second[Lane.getKnownLane()]);
      continue;
    }
    if (Lane.getKind() == VPLane::Kind::ScalableLast) {
@ -482,11 +485,8 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
          Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));
      continue;
    }
-    // If Op is a definition that has been unrolled, directly use the clone for
+    if (vputils::isSingleScalar(Op)) {
-    // the corresponding lane.
+      NewOps.push_back(Op);
    auto LaneDefs = Def2LaneDefs.find(Op);
    if (LaneDefs != Def2LaneDefs.end()) {
      NewOps.push_back(LaneDefs->second[Lane.getKnownLane()]);
      continue;
    }
--- a/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll
@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
 ; RUN: opt -p loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck %s
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
@ -215,11 +215,98 @@ loop.latch:
 exit:
  ret void
 }
-;.
+
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+define float @uniform_load_replicating_select(ptr %A, ptr %B, i64 %1) {
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-LABEL: define float @uniform_load_replicating_select(
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[TMP0:%.*]]) {
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[TMP0]], 1
-; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 8
-;.
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 8
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 5
 ; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 6
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 7
 ; CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr [[A]], align 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = fcmp ogt float [[TMP6]], 0.000000e+00
 ; CHECK-NEXT:    [[TMP8:%.*]] = fcmp ogt float [[TMP6]], 0.000000e+00
 ; CHECK-NEXT:    [[TMP9:%.*]] = fcmp ogt float [[TMP6]], 0.000000e+00
 ; CHECK-NEXT:    [[TMP10:%.*]] = fcmp ogt float [[TMP6]], 0.000000e+00
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i1> poison, i1 [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i1> [[TMP11]], i1 [[TMP8]], i32 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i1> [[TMP12]], i1 [[TMP9]], i32 2
 ; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x i1> [[TMP13]], i1 [[TMP10]], i32 3
 ; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP7]], ptr [[A]], ptr [[TMP15]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP8]], ptr [[A]], ptr [[TMP16]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TMP9]], ptr [[A]], ptr [[TMP17]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = select i1 [[TMP10]], ptr [[A]], ptr [[TMP18]]
 ; CHECK-NEXT:    [[TMP23:%.*]] = select <4 x i1> [[TMP14]], <4 x float> splat (float 1.000000e+01), <4 x float> splat (float 1.000000e+00)
 ; CHECK-NEXT:    [[TMP24:%.*]] = load float, ptr [[TMP19]], align 4
 ; CHECK-NEXT:    [[TMP25:%.*]] = load float, ptr [[TMP20]], align 4
 ; CHECK-NEXT:    [[TMP26:%.*]] = load float, ptr [[TMP21]], align 4
 ; CHECK-NEXT:    [[TMP27:%.*]] = load float, ptr [[TMP22]], align 4
 ; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <4 x float> poison, float [[TMP24]], i32 0
 ; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <4 x float> [[TMP28]], float [[TMP25]], i32 1
 ; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[TMP26]], i32 2
 ; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[TMP27]], i32 3
 ; CHECK-NEXT:    [[TMP32:%.*]] = fdiv <4 x float> splat (float 4.000000e+00), [[TMP31]]
 ; CHECK-NEXT:    [[TMP33:%.*]] = call <4 x float> @llvm.pow.v4f32(<4 x float> [[TMP23]], <4 x float> [[TMP32]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP34]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <4 x float> [[TMP33]], i32 3
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[A]], align 4
 ; CHECK-NEXT:    [[C:%.*]] = fcmp ogt float [[L]], 0.000000e+00
 ; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
 ; CHECK-NEXT:    [[SEL_PTR:%.*]] = select i1 [[C]], ptr [[A]], ptr [[GEP_B]]
 ; CHECK-NEXT:    [[BASE:%.*]] = select i1 [[C]], float 1.000000e+01, float 1.000000e+00
 ; CHECK-NEXT:    [[L_2:%.*]] = load float, ptr [[SEL_PTR]], align 4
 ; CHECK-NEXT:    [[DIV:%.*]] = fdiv float 4.000000e+00, [[L_2]]
 ; CHECK-NEXT:    [[POW:%.*]] = tail call float @llvm.pow.f32(float [[BASE]], float [[DIV]])
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[TMP0]]
 ; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    [[POW_LCSSA:%.*]] = phi float [ [[POW]], %[[LOOP]] ], [ [[TMP35]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[POW_LCSSA]]
 ;
 entry:
  br label %loop
 loop:
  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
  %l = load float, ptr %A, align 4
  %c = fcmp ogt float %l, 0.000000e+00
  %gep.B = getelementptr inbounds float, ptr %B, i64 %iv
  %sel.ptr = select i1 %c, ptr %A, ptr %gep.B
  %base = select i1 %c, float 10.000000e+00, float 1.000000e+00
  %l.2 = load float, ptr %sel.ptr, align 4
  %div = fdiv float 4.000000e+00, %l.2
  %pow = tail call float @llvm.pow.f32(float %base, float %div)
  %iv.next = add i64 %iv, 1
  %ec = icmp eq i64 %iv, %1
  br i1 %ec, label %exit, label %loop
 exit:
  ret float %pow
 }
 declare float @llvm.pow.f32(float, float)