From 94ec7ffa46d351b86fbbe3a445ceef37f331c4a2 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 2 Apr 2026 06:47:53 -0400 Subject: [PATCH] [SLP] Do not skip tiny trees with gathered loads to vectorize The isTreeTinyAndNotFullyVectorizable check for 2-node trees (insertelement root + gather child) was too aggressive: it rejected trees even when LoadEntriesToVectorize was non-empty, preventing gathered loads from being vectorized into masked loads/strided loads, etc. Reviewers: RKSimon, hiraditya Pull Request: https://github.com/llvm/llvm-project/pull/190040 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 2 +- .../SLPVectorizer/RISCV/complex-loads.ll | 6 ++-- .../SLPVectorizer/X86/pr47629-inseltpoison.ll | 28 ++++--------------- .../Transforms/SLPVectorizer/X86/pr47629.ll | 28 ++++--------------- 4 files changed, 15 insertions(+), 49 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index f7c78db5a83a..68bdc89b0a24 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -16896,7 +16896,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { // No need to vectorize inserts of gathered values. 
if (VectorizableTree.size() == 2 && isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) && - VectorizableTree[1]->isGather() && + LoadEntriesToVectorize.empty() && VectorizableTree[1]->isGather() && (VectorizableTree[1]->getVectorFactor() <= 2 || !(isSplat(VectorizableTree[1]->Scalars) || allConstant(VectorizableTree[1]->Scalars)))) diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll index 621f5363070a..90dc555d7bc5 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll @@ -154,8 +154,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4 ; THR15-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4 ; THR15-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4 -; THR15-NEXT: [[TMP0:%.*]] = load i8, ptr null, align 1 -; THR15-NEXT: [[TMP1:%.*]] = load i8, ptr null, align 1 +; THR15-NEXT: [[TMP0:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> align 1 zeroinitializer, <2 x i1> splat (i1 true), <2 x i8> poison) ; THR15-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1 ; THR15-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32> ; THR15-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 @@ -223,8 +222,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[TMP66:%.*]] = zext <4 x i8> [[TMP65]] to <4 x i32> ; THR15-NEXT: [[TMP67:%.*]] = sub <4 x i32> [[TMP64]], [[TMP66]] ; THR15-NEXT: [[TMP68:%.*]] = shufflevector <4 x i32> [[TMP67]], <4 x i32> poison, <4 x i32> -; THR15-NEXT: [[TMP69:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i32 0 -; THR15-NEXT: [[TMP70:%.*]] = insertelement <4 x i8> [[TMP69]], i8 [[TMP0]], i32 1 +; THR15-NEXT: [[TMP70:%.*]] = shufflevector <2 x i8> [[TMP0]], <2 x i8> poison, <4 x i32> ; 
THR15-NEXT: [[TMP116:%.*]] = shufflevector <2 x i8> [[TMP62]], <2 x i8> poison, <4 x i32> ; THR15-NEXT: [[TMP71:%.*]] = shufflevector <4 x i8> [[TMP70]], <4 x i8> [[TMP116]], <4 x i32> ; THR15-NEXT: [[TMP72:%.*]] = zext <4 x i8> [[TMP71]] to <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll index 8b58d0cdccf2..4c8e88ca3407 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -68,40 +68,24 @@ define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonl ; ; AVX512F-LABEL: define void @gather_load( ; AVX512F-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { -; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 -; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0:![0-9]+]] ; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 -; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 -; AVX512F-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 -; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 -; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 -; AVX512F-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX512F-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 -; AVX512F-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1 -; AVX512F-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> 
[[TMP15]], i32 [[TMP12]], i32 3 +; AVX512F-NEXT: [[TMP6:%.*]] = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr align 4 [[TMP1]], <12 x i1> , <12 x i32> poison), !tbaa [[SHORT_TBAA0:![0-9]+]] +; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <12 x i32> [[TMP6]], <12 x i32> poison, <4 x i32> +; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> ; AVX512F-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], ; AVX512F-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: define void @gather_load( ; AVX512VL-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { -; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 -; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0:![0-9]+]] ; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 -; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 -; AVX512VL-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 -; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 -; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 -; AVX512VL-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX512VL-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0 -; AVX512VL-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1 -; AVX512VL-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2 -; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3 +; AVX512VL-NEXT: [[TMP6:%.*]] = call <12 x i32> 
@llvm.masked.load.v12i32.p0(ptr align 4 [[TMP1]], <12 x i1> , <12 x i32> poison), !tbaa [[SHORT_TBAA0:![0-9]+]] +; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <12 x i32> [[TMP6]], <12 x i32> poison, <4 x i32> +; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> ; AVX512VL-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], ; AVX512VL-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll index 2d6f007cb234..82f888610725 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -68,40 +68,24 @@ define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonl ; ; AVX512F-LABEL: define void @gather_load( ; AVX512F-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { -; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 -; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0:![0-9]+]] ; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 -; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 -; AVX512F-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 -; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 -; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 -; AVX512F-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX512F-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 -; AVX512F-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> 
[[TMP16]], i32 [[TMP7]], i32 1 -; AVX512F-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2 -; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3 +; AVX512F-NEXT: [[TMP6:%.*]] = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr align 4 [[TMP1]], <12 x i1> , <12 x i32> poison), !tbaa [[SHORT_TBAA0:![0-9]+]] +; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <12 x i32> [[TMP6]], <12 x i32> poison, <4 x i32> +; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> ; AVX512F-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], ; AVX512F-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: define void @gather_load( ; AVX512VL-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { -; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 -; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0:![0-9]+]] ; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1 -; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 -; AVX512VL-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 -; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 -; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 -; AVX512VL-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]] -; AVX512VL-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0 -; AVX512VL-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1 -; AVX512VL-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], 
i32 [[TMP10]], i32 2 -; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3 +; AVX512VL-NEXT: [[TMP6:%.*]] = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr align 4 [[TMP1]], <12 x i1> , <12 x i32> poison), !tbaa [[SHORT_TBAA0:![0-9]+]] +; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <12 x i32> [[TMP6]], <12 x i32> poison, <4 x i32> +; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> ; AVX512VL-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], ; AVX512VL-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]] ; AVX512VL-NEXT: ret void