From d67dba5e8880bdd856b0d3812dcdccf8c7902aba Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Thu, 21 Aug 2025 11:34:49 +0100
Subject: [PATCH] [VPlan] Check Def2LaneDefs first in cloneForLane. (NFC)

If Def2LaneDefs has an entry for an operand, we must always use it. Move
the Def2LaneDefs lookup to the front of the operand loop, ahead of the
ScalableLast and isSingleScalar checks.

Otherwise we may pick the wrong operand, e.g. if Op is a replicate
recipe that became single-scalar after it was replicated.

Fixes https://github.com/llvm/llvm-project/issues/154330.
---
 llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp |  14 +--
 ...licate-recipe-with-only-first-lane-used.ll | 105 ++++++++++++++++--
 2 files changed, 103 insertions(+), 16 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 62fd83a5e092..d47d76d20373 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -473,8 +473,11 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
   // Collect the operands at Lane, creating extracts as needed.
   SmallVector<VPValue *> NewOps;
   for (VPValue *Op : RepR->operands()) {
-    if (vputils::isSingleScalar(Op)) {
-      NewOps.push_back(Op);
+    // If Op is a definition that has been unrolled, directly use the clone for
+    // the corresponding lane.
+    auto LaneDefs = Def2LaneDefs.find(Op);
+    if (LaneDefs != Def2LaneDefs.end()) {
+      NewOps.push_back(LaneDefs->second[Lane.getKnownLane()]);
       continue;
     }
     if (Lane.getKind() == VPLane::Kind::ScalableLast) {
@@ -482,11 +485,8 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
           Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));
       continue;
     }
-    // If Op is a definition that has been unrolled, directly use the clone for
-    // the corresponding lane.
-    auto LaneDefs = Def2LaneDefs.find(Op);
-    if (LaneDefs != Def2LaneDefs.end()) {
-      NewOps.push_back(LaneDefs->second[Lane.getKnownLane()]);
+    if (vputils::isSingleScalar(Op)) {
+      NewOps.push_back(Op);
       continue;
     }
 
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll
index 1c6a225f99f6..e2e44b170163 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
 ; RUN: opt -p loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck %s
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
@@ -215,11 +215,98 @@ loop.latch:
 exit:
   ret void
 }
-;.
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
-; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
-;.
+
+define float @uniform_load_replicating_select(ptr %A, ptr %B, i64 %1) {
+; CHECK-LABEL: define float @uniform_load_replicating_select(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[TMP0:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[TMP0]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 5
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 7
+; CHECK-NEXT:    [[TMP6:%.*]] = load float, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = fcmp ogt float [[TMP6]], 0.000000e+00
+; CHECK-NEXT:    [[TMP8:%.*]] = fcmp ogt float [[TMP6]], 0.000000e+00
+; CHECK-NEXT:    [[TMP9:%.*]] = fcmp ogt float [[TMP6]], 0.000000e+00
+; CHECK-NEXT:    [[TMP10:%.*]] = fcmp ogt float [[TMP6]], 0.000000e+00
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i1> poison, i1 [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i1> [[TMP11]], i1 [[TMP8]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i1> [[TMP12]], i1 [[TMP9]], i32 2
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x i1> [[TMP13]], i1 [[TMP10]], i32 3
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP7]], ptr [[A]], ptr [[TMP15]]
+; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP8]], ptr [[A]], ptr [[TMP16]]
+; CHECK-NEXT:    [[TMP21:%.*]] = select i1 [[TMP9]], ptr [[A]], ptr [[TMP17]]
+; CHECK-NEXT:    [[TMP22:%.*]] = select i1 [[TMP10]], ptr [[A]], ptr [[TMP18]]
+; CHECK-NEXT:    [[TMP23:%.*]] = select <4 x i1> [[TMP14]], <4 x float> splat (float 1.000000e+01), <4 x float> splat (float 1.000000e+00)
+; CHECK-NEXT:    [[TMP24:%.*]] = load float, ptr [[TMP19]], align 4
+; CHECK-NEXT:    [[TMP25:%.*]] = load float, ptr [[TMP20]], align 4
+; CHECK-NEXT:    [[TMP26:%.*]] = load float, ptr [[TMP21]], align 4
+; CHECK-NEXT:    [[TMP27:%.*]] = load float, ptr [[TMP22]], align 4
+; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <4 x float> poison, float [[TMP24]], i32 0
+; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <4 x float> [[TMP28]], float [[TMP25]], i32 1
+; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[TMP26]], i32 2
+; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[TMP27]], i32 3
+; CHECK-NEXT:    [[TMP32:%.*]] = fdiv <4 x float> splat (float 4.000000e+00), [[TMP31]]
+; CHECK-NEXT:    [[TMP33:%.*]] = call <4 x float> @llvm.pow.v4f32(<4 x float> [[TMP23]], <4 x float> [[TMP32]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP34]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <4 x float> [[TMP33]], i32 3
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[A]], align 4
+; CHECK-NEXT:    [[C:%.*]] = fcmp ogt float [[L]], 0.000000e+00
+; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
+; CHECK-NEXT:    [[SEL_PTR:%.*]] = select i1 [[C]], ptr [[A]], ptr [[GEP_B]]
+; CHECK-NEXT:    [[BASE:%.*]] = select i1 [[C]], float 1.000000e+01, float 1.000000e+00
+; CHECK-NEXT:    [[L_2:%.*]] = load float, ptr [[SEL_PTR]], align 4
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv float 4.000000e+00, [[L_2]]
+; CHECK-NEXT:    [[POW:%.*]] = tail call float @llvm.pow.f32(float [[BASE]], float [[DIV]])
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[TMP0]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[POW_LCSSA:%.*]] = phi float [ [[POW]], %[[LOOP]] ], [ [[TMP35]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret float [[POW_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+  %l = load float, ptr %A, align 4
+  %c = fcmp ogt float %l, 0.000000e+00
+  %gep.B = getelementptr inbounds float, ptr %B, i64 %iv
+  %sel.ptr = select i1 %c, ptr %A, ptr %gep.B
+  %base = select i1 %c, float 10.000000e+00, float 1.000000e+00
+  %l.2 = load float, ptr %sel.ptr, align 4
+  %div = fdiv float 4.000000e+00, %l.2
+  %pow = tail call float @llvm.pow.f32(float %base, float %div)
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv, %1
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %pow
+}
+
+declare float @llvm.pow.f32(float, float)