[SLP] Do not skip tiny trees with gathered loads to vectorize
The isTreeTinyAndNotFullyVectorizable check for 2-node trees (insertelement root + gather child) was too aggressive: it rejected trees even when LoadEntriesToVectorize was non-empty, preventing gathered loads from being vectorized into masked loads/strided loads, etc. Reviewers: RKSimon, hiraditya Pull Request: https://github.com/llvm/llvm-project/pull/190040
This commit is contained in:
parent
5613f77293
commit
94ec7ffa46
@ -16896,7 +16896,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
|
||||
// No need to vectorize inserts of gathered values.
|
||||
if (VectorizableTree.size() == 2 &&
|
||||
isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
|
||||
VectorizableTree[1]->isGather() &&
|
||||
LoadEntriesToVectorize.empty() && VectorizableTree[1]->isGather() &&
|
||||
(VectorizableTree[1]->getVectorFactor() <= 2 ||
|
||||
!(isSplat(VectorizableTree[1]->Scalars) ||
|
||||
allConstant(VectorizableTree[1]->Scalars))))
|
||||
|
||||
@ -154,8 +154,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
|
||||
; THR15-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
|
||||
; THR15-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
|
||||
; THR15-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
|
||||
; THR15-NEXT: [[TMP0:%.*]] = load i8, ptr null, align 1
|
||||
; THR15-NEXT: [[TMP1:%.*]] = load i8, ptr null, align 1
|
||||
; THR15-NEXT: [[TMP0:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> align 1 zeroinitializer, <2 x i1> splat (i1 true), <2 x i8> poison)
|
||||
; THR15-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1
|
||||
; THR15-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
|
||||
; THR15-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1
|
||||
@ -223,8 +222,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
|
||||
; THR15-NEXT: [[TMP66:%.*]] = zext <4 x i8> [[TMP65]] to <4 x i32>
|
||||
; THR15-NEXT: [[TMP67:%.*]] = sub <4 x i32> [[TMP64]], [[TMP66]]
|
||||
; THR15-NEXT: [[TMP68:%.*]] = shufflevector <4 x i32> [[TMP67]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
|
||||
; THR15-NEXT: [[TMP69:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i32 0
|
||||
; THR15-NEXT: [[TMP70:%.*]] = insertelement <4 x i8> [[TMP69]], i8 [[TMP0]], i32 1
|
||||
; THR15-NEXT: [[TMP70:%.*]] = shufflevector <2 x i8> [[TMP0]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
|
||||
; THR15-NEXT: [[TMP116:%.*]] = shufflevector <2 x i8> [[TMP62]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
|
||||
; THR15-NEXT: [[TMP71:%.*]] = shufflevector <4 x i8> [[TMP70]], <4 x i8> [[TMP116]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
|
||||
; THR15-NEXT: [[TMP72:%.*]] = zext <4 x i8> [[TMP71]] to <4 x i32>
|
||||
|
||||
@ -68,40 +68,24 @@ define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonl
|
||||
;
|
||||
; AVX512F-LABEL: define void @gather_load(
|
||||
; AVX512F-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
|
||||
; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1
|
||||
; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0:![0-9]+]]
|
||||
; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1
|
||||
; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
|
||||
; AVX512F-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]]
|
||||
; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
|
||||
; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
|
||||
; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]]
|
||||
; AVX512F-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
|
||||
; AVX512F-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]]
|
||||
; AVX512F-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
|
||||
; AVX512F-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
|
||||
; AVX512F-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
|
||||
; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
|
||||
; AVX512F-NEXT: [[TMP6:%.*]] = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr align 4 [[TMP1]], <12 x i1> <i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <12 x i32> poison), !tbaa [[SHORT_TBAA0:![0-9]+]]
|
||||
; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <12 x i32> [[TMP6]], <12 x i32> poison, <4 x i32> <i32 0, i32 1, i32 4, i32 11>
|
||||
; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
|
||||
; AVX512F-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
|
||||
; AVX512F-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]]
|
||||
; AVX512F-NEXT: ret void
|
||||
;
|
||||
; AVX512VL-LABEL: define void @gather_load(
|
||||
; AVX512VL-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
|
||||
; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1
|
||||
; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0:![0-9]+]]
|
||||
; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1
|
||||
; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
|
||||
; AVX512VL-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]]
|
||||
; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
|
||||
; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
|
||||
; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]]
|
||||
; AVX512VL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
|
||||
; AVX512VL-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]]
|
||||
; AVX512VL-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
|
||||
; AVX512VL-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
|
||||
; AVX512VL-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
|
||||
; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
|
||||
; AVX512VL-NEXT: [[TMP6:%.*]] = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr align 4 [[TMP1]], <12 x i1> <i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <12 x i32> poison), !tbaa [[SHORT_TBAA0:![0-9]+]]
|
||||
; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <12 x i32> [[TMP6]], <12 x i32> poison, <4 x i32> <i32 0, i32 1, i32 4, i32 11>
|
||||
; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
|
||||
; AVX512VL-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
|
||||
; AVX512VL-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]]
|
||||
; AVX512VL-NEXT: ret void
|
||||
|
||||
@ -68,40 +68,24 @@ define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonl
|
||||
;
|
||||
; AVX512F-LABEL: define void @gather_load(
|
||||
; AVX512F-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
|
||||
; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1
|
||||
; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0:![0-9]+]]
|
||||
; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1
|
||||
; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
|
||||
; AVX512F-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]]
|
||||
; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
|
||||
; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
|
||||
; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]]
|
||||
; AVX512F-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
|
||||
; AVX512F-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]]
|
||||
; AVX512F-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
|
||||
; AVX512F-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
|
||||
; AVX512F-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
|
||||
; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
|
||||
; AVX512F-NEXT: [[TMP6:%.*]] = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr align 4 [[TMP1]], <12 x i1> <i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <12 x i32> poison), !tbaa [[SHORT_TBAA0:![0-9]+]]
|
||||
; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <12 x i32> [[TMP6]], <12 x i32> poison, <4 x i32> <i32 0, i32 1, i32 4, i32 11>
|
||||
; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
|
||||
; AVX512F-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
|
||||
; AVX512F-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]]
|
||||
; AVX512F-NEXT: ret void
|
||||
;
|
||||
; AVX512VL-LABEL: define void @gather_load(
|
||||
; AVX512VL-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
|
||||
; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1
|
||||
; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0:![0-9]+]]
|
||||
; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1
|
||||
; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
|
||||
; AVX512VL-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]]
|
||||
; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
|
||||
; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
|
||||
; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]]
|
||||
; AVX512VL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
|
||||
; AVX512VL-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]]
|
||||
; AVX512VL-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
|
||||
; AVX512VL-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
|
||||
; AVX512VL-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
|
||||
; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
|
||||
; AVX512VL-NEXT: [[TMP6:%.*]] = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr align 4 [[TMP1]], <12 x i1> <i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <12 x i32> poison), !tbaa [[SHORT_TBAA0:![0-9]+]]
|
||||
; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <12 x i32> [[TMP6]], <12 x i32> poison, <4 x i32> <i32 0, i32 1, i32 4, i32 11>
|
||||
; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
|
||||
; AVX512VL-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
|
||||
; AVX512VL-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]]
|
||||
; AVX512VL-NEXT: ret void
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user