[VPlan] Check Def2LaneDefs first in cloneForLane. (NFC)

If we have entries in Def2LaneDefs, we always have to use it. Move the
check before.

Otherwise we may not pick the correct operand, e.g. if Op was a
replicate recipe that got single-scalar after replicating it.

Fixes https://github.com/llvm/llvm-project/issues/154330.
This commit is contained in:
Florian Hahn 2025-08-21 11:34:49 +01:00
parent b20c291bae
commit d67dba5e88
No known key found for this signature in database
GPG Key ID: 3596DCFCBB5C67CC
2 changed files with 103 additions and 16 deletions

View File

@ -473,8 +473,11 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
// Collect the operands at Lane, creating extracts as needed. // Collect the operands at Lane, creating extracts as needed.
SmallVector<VPValue *> NewOps; SmallVector<VPValue *> NewOps;
for (VPValue *Op : RepR->operands()) { for (VPValue *Op : RepR->operands()) {
if (vputils::isSingleScalar(Op)) { // If Op is a definition that has been unrolled, directly use the clone for
NewOps.push_back(Op); // the corresponding lane.
auto LaneDefs = Def2LaneDefs.find(Op);
if (LaneDefs != Def2LaneDefs.end()) {
NewOps.push_back(LaneDefs->second[Lane.getKnownLane()]);
continue; continue;
} }
if (Lane.getKind() == VPLane::Kind::ScalableLast) { if (Lane.getKind() == VPLane::Kind::ScalableLast) {
@ -482,11 +485,8 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op})); Builder.createNaryOp(VPInstruction::ExtractLastElement, {Op}));
continue; continue;
} }
// If Op is a definition that has been unrolled, directly use the clone for if (vputils::isSingleScalar(Op)) {
// the corresponding lane. NewOps.push_back(Op);
auto LaneDefs = Def2LaneDefs.find(Op);
if (LaneDefs != Def2LaneDefs.end()) {
NewOps.push_back(LaneDefs->second[Lane.getKnownLane()]);
continue; continue;
} }

View File

@ -1,4 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
; RUN: opt -p loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck %s ; RUN: opt -p loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
@ -215,11 +215,98 @@ loop.latch:
exit: exit:
ret void ret void
} }
;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} define float @uniform_load_replicating_select(ptr %A, ptr %B, i64 %1) {
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK-LABEL: define float @uniform_load_replicating_select(
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i64 [[TMP0:%.*]]) {
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} ; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 1
; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 8
;. ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 8
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 5
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 6
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 7
; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[A]], align 4
; CHECK-NEXT: [[TMP7:%.*]] = fcmp ogt float [[TMP6]], 0.000000e+00
; CHECK-NEXT: [[TMP8:%.*]] = fcmp ogt float [[TMP6]], 0.000000e+00
; CHECK-NEXT: [[TMP9:%.*]] = fcmp ogt float [[TMP6]], 0.000000e+00
; CHECK-NEXT: [[TMP10:%.*]] = fcmp ogt float [[TMP6]], 0.000000e+00
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i1> poison, i1 [[TMP7]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i1> [[TMP11]], i1 [[TMP8]], i32 1
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i1> [[TMP12]], i1 [[TMP9]], i32 2
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i1> [[TMP13]], i1 [[TMP10]], i32 3
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]]
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP7]], ptr [[A]], ptr [[TMP15]]
; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP8]], ptr [[A]], ptr [[TMP16]]
; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP9]], ptr [[A]], ptr [[TMP17]]
; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP10]], ptr [[A]], ptr [[TMP18]]
; CHECK-NEXT: [[TMP23:%.*]] = select <4 x i1> [[TMP14]], <4 x float> splat (float 1.000000e+01), <4 x float> splat (float 1.000000e+00)
; CHECK-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP19]], align 4
; CHECK-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP20]], align 4
; CHECK-NEXT: [[TMP26:%.*]] = load float, ptr [[TMP21]], align 4
; CHECK-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP22]], align 4
; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x float> poison, float [[TMP24]], i32 0
; CHECK-NEXT: [[TMP29:%.*]] = insertelement <4 x float> [[TMP28]], float [[TMP25]], i32 1
; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[TMP29]], float [[TMP26]], i32 2
; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x float> [[TMP30]], float [[TMP27]], i32 3
; CHECK-NEXT: [[TMP32:%.*]] = fdiv <4 x float> splat (float 4.000000e+00), [[TMP31]]
; CHECK-NEXT: [[TMP33:%.*]] = call <4 x float> @llvm.pow.v4f32(<4 x float> [[TMP23]], <4 x float> [[TMP32]])
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP34]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x float> [[TMP33]], i32 3
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
; CHECK-NEXT: [[L:%.*]] = load float, ptr [[A]], align 4
; CHECK-NEXT: [[C:%.*]] = fcmp ogt float [[L]], 0.000000e+00
; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[IV]]
; CHECK-NEXT: [[SEL_PTR:%.*]] = select i1 [[C]], ptr [[A]], ptr [[GEP_B]]
; CHECK-NEXT: [[BASE:%.*]] = select i1 [[C]], float 1.000000e+01, float 1.000000e+00
; CHECK-NEXT: [[L_2:%.*]] = load float, ptr [[SEL_PTR]], align 4
; CHECK-NEXT: [[DIV:%.*]] = fdiv float 4.000000e+00, [[L_2]]
; CHECK-NEXT: [[POW:%.*]] = tail call float @llvm.pow.f32(float [[BASE]], float [[DIV]])
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], [[TMP0]]
; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[POW_LCSSA:%.*]] = phi float [ [[POW]], %[[LOOP]] ], [ [[TMP35]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret float [[POW_LCSSA]]
;
entry:
br label %loop
loop:
%iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
%l = load float, ptr %A, align 4
%c = fcmp ogt float %l, 0.000000e+00
%gep.B = getelementptr inbounds float, ptr %B, i64 %iv
%sel.ptr = select i1 %c, ptr %A, ptr %gep.B
%base = select i1 %c, float 10.000000e+00, float 1.000000e+00
%l.2 = load float, ptr %sel.ptr, align 4
%div = fdiv float 4.000000e+00, %l.2
%pow = tail call float @llvm.pow.f32(float %base, float %div)
%iv.next = add i64 %iv, 1
%ec = icmp eq i64 %iv, %1
br i1 %ec, label %exit, label %loop
exit:
ret float %pow
}
declare float @llvm.pow.f32(float, float)