From 70aebae2a13114f4e3d5e2460c052d8f3de295be Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 10 Feb 2026 08:59:44 -0500 Subject: [PATCH] [SLP]Support for zext i1 %x modeling as select %x, 1, 0 Model zext i1 %x to in as select i1 %x, in 1, in 0 in case, if there are other select instructions, which can be combined into a bundle. Fixes #178403 Reviewers: hiraditya, RKSimon Pull Request: https://github.com/llvm/llvm-project/pull/180635 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 89 ++++- .../Transforms/SLPVectorizer/X86/bool-mask.ll | 378 ++++-------------- .../subvector-minbitwidth-unsigned-value.ll | 14 +- 3 files changed, 158 insertions(+), 323 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index f89c22fafcf0..f18b6fc4dd95 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5761,12 +5761,15 @@ private: // moment are extracts where their second (immediate) operand is // not added. Since immediates do not affect scheduler behavior // this is considered okay. - assert(In && - (isa(In) || - In->getNumOperands() == - Bundle->getTreeEntry()->getNumOperands() || - Bundle->getTreeEntry()->isCopyableElement(In)) && - "Missed TreeEntry operands?"); + assert( + In && + (isa(In) || + In->getNumOperands() == + Bundle->getTreeEntry()->getNumOperands() || + (isa(In) && Bundle->getTreeEntry()->getOpcode() == + Instruction::Select) || + Bundle->getTreeEntry()->isCopyableElement(In)) && + "Missed TreeEntry operands?"); // Count the number of unique phi nodes, which are the parent for // parent entry, and exit, if all the unique phis are processed. @@ -11345,7 +11348,6 @@ class InstructionsCompatibilityAnalysis { case Instruction::BitCast: case Instruction::ICmp: case Instruction::FCmp: - case Instruction::Select: case Instruction::FNeg: case Instruction::Add: case Instruction::FAdd: @@ -11381,6 +11383,30 @@ class InstructionsCompatibilityAnalysis { Ops[Idx] = ConvertedOps[OpIdx]; } return; + case Instruction::Select: + Operands.assign(VL0->getNumOperands(), {VL.size(), nullptr}); + for (auto [Idx, V] : enumerate(VL)) { + auto *I = dyn_cast(V); + if (!I) { + for (auto [OpIdx, Ops] : enumerate(Operands)) + Ops[Idx] = PoisonValue::get(VL0->getOperand(OpIdx)->getType()); + continue; + } + if (isa(I)) { + // Special case for select + zext i1 to avoid explosion of different + // types. We want to keep the condition as i1 to be able to match + // different selects together and reuse the vectorized condition + // rather than trying to gather it. + Operands[0][Idx] = I->getOperand(0); + Operands[1][Idx] = ConstantInt::get(I->getType(), 1); + Operands[2][Idx] = ConstantInt::getNullValue(I->getType()); + continue; + } + auto [Op, ConvertedOps] = convertTo(I, S); + for (auto [OpIdx, Ops] : enumerate(Operands)) + Ops[Idx] = ConvertedOps[OpIdx]; + } + return; case Instruction::GetElementPtr: { Operands.assign(2, {VL.size(), nullptr}); // Need to cast all indices to the same type before vectorization to @@ -11453,6 +11479,22 @@ public: : getSameOpcode(VL, TLI); if (S) return S; + // Check if series of selects + zext i1 %x to in can be combined into + // selects + select %x, i32 1, i32 0. + Instruction *SelectOp = nullptr; + if (allSameBlock(VL) && all_of(VL, [&](Value *V) { + if (match(V, m_Select(m_Value(), m_Value(), m_Value()))) { + if (!SelectOp) + SelectOp = cast(V); + return true; + } + auto *ZExt = dyn_cast(V); + return (ZExt && ZExt->getSrcTy()->isIntegerTy(1)) || + isa(V); + })) { + if (SelectOp) + return InstructionsState(SelectOp, SelectOp); + } if (!VectorizeCopyableElements || !TryCopyableElementsVectorization) return S; findAndSetMainInstruction(VL, R); @@ -15481,6 +15523,10 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, if (isa(UniqueValues[Idx])) return InstructionCost(TTI::TCC_Free); + if (!isa(UniqueValues[Idx])) + return TTI->getInstructionCost(cast(UniqueValues[Idx]), + CostKind); + auto *VI = cast(UniqueValues[Idx]); CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy() ? CmpInst::BAD_FCMP_PREDICATE @@ -25243,6 +25289,32 @@ private: (I && !isa(I) && isValidForAlternation(I->getOpcode())); } + /// Optimizes original placement of the reduced values for the reduction tree. + /// For example, if there is a zext i1 + selects, we can merge select + /// into zext and improve emission of the reductions. + void optimizeReducedVals() { + SmallDenseMap UsedReductionOpIds; + for (const auto [Idx, Vals] : enumerate(ReducedVals)) { + if (auto *I = dyn_cast(Vals.front())) + UsedReductionOpIds.try_emplace(I->getOpcode(), Idx); + } + // Check if zext i1 can be merged with select. + auto ZExtIt = UsedReductionOpIds.find(Instruction::ZExt); + auto SelectIt = UsedReductionOpIds.find(Instruction::Select); + if (ZExtIt != UsedReductionOpIds.end() && + SelectIt != UsedReductionOpIds.end()) { + unsigned ZExtIdx = ZExtIt->second; + unsigned SelectIdx = SelectIt->second; + auto *ZExt = cast(ReducedVals[ZExtIdx].front()); + // ZExt is compatible with Select? Merge select to zext, if so. + if (ZExt->getSrcTy()->isIntegerTy(1) && + ZExt->getType() == ReducedVals[SelectIdx].front()->getType()) { + ReducedVals[ZExtIdx].append(ReducedVals[SelectIdx]); + ReducedVals.erase(std::next(ReducedVals.begin(), SelectIdx)); + } + } + } + public: HorizontalReduction() = default; HorizontalReduction(Instruction *I, ArrayRef Ops) @@ -25418,6 +25490,9 @@ public: ReducedVals.back().append(Data.rbegin(), Data.rend()); } } + // Post optimize reduced values to get better reduction sequences and sort + // them by size. + optimizeReducedVals(); // Sort the reduced values by number of same/alternate opcode and/or pointer // operand. stable_sort(ReducedVals, [](ArrayRef P1, ArrayRef P2) { diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll b/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll index e1ee35217d18..c1928fbf6ba6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll @@ -17,110 +17,32 @@ define i64 @bitmask_16xi8(ptr nocapture noundef readonly %src) { ; SSE-LABEL: @bitmask_16xi8( ; SSE-NEXT: entry: -; SSE-NEXT: [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1 -; SSE-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0 -; SSE-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 -; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1 -; SSE-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1 -; SSE-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer -; SSE-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> -; SSE-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1 -; SSE-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer -; SSE-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> -; SSE-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13 -; SSE-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1 -; SSE-NEXT: [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0 -; SSE-NEXT: [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192 -; SSE-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14 -; SSE-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1 -; SSE-NEXT: [[TOBOOL_NOT_14:%.*]] = icmp eq i8 [[TMP8]], 0 -; SSE-NEXT: [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384 -; SSE-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15 -; SSE-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 -; SSE-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0 -; SSE-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768 -; SSE-NEXT: [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> poison, <4 x i32> -; SSE-NEXT: [[RDX_OP:%.*]] = or <4 x i64> [[TMP10]], [[TMP6]] -; SSE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[RDX_OP]], <4 x i64> poison, <8 x i32> -; SSE-NEXT: [[TMP11:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> [[TMP12]], <8 x i32> -; SSE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP11]]) -; SSE-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP16]], [[OR_13]] -; SSE-NEXT: [[OP_RDX5:%.*]] = or i64 [[OR_14]], [[OR_15]] -; SSE-NEXT: [[OP_RDX6:%.*]] = or i64 [[OP_RDX]], [[OP_RDX5]] -; SSE-NEXT: [[OP_RDX7:%.*]] = or i64 [[OP_RDX6]], [[OR]] +; SSE-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; SSE-NEXT: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer +; SSE-NEXT: [[TMP2:%.*]] = icmp eq <16 x i8> [[TMP0]], zeroinitializer +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <16 x i1> [[TMP1]], <16 x i1> [[TMP2]], <16 x i32> +; SSE-NEXT: [[TMP4:%.*]] = select <16 x i1> [[TMP3]], <16 x i64> , <16 x i64> +; SSE-NEXT: [[OP_RDX7:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP4]]) ; SSE-NEXT: ret i64 [[OP_RDX7]] ; ; AVX-LABEL: @bitmask_16xi8( ; AVX-NEXT: entry: -; AVX-NEXT: [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1 -; AVX-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0 -; AVX-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 -; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1 -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1 -; AVX-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer -; AVX-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> -; AVX-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9 -; AVX-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1 -; AVX-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer -; AVX-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> -; AVX-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13 -; AVX-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1 -; AVX-NEXT: [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0 -; AVX-NEXT: [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192 -; AVX-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14 -; AVX-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1 -; AVX-NEXT: [[TOBOOL_NOT_14:%.*]] = icmp eq i8 [[TMP8]], 0 -; AVX-NEXT: [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384 -; AVX-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15 -; AVX-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 -; AVX-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0 -; AVX-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768 -; AVX-NEXT: [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> poison, <4 x i32> -; AVX-NEXT: [[RDX_OP:%.*]] = or <4 x i64> [[TMP10]], [[TMP6]] -; AVX-NEXT: [[TMP13:%.*]] = shufflevector <4 x i64> [[RDX_OP]], <4 x i64> poison, <8 x i32> -; AVX-NEXT: [[TMP11:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> [[TMP13]], <8 x i32> -; AVX-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP11]]) -; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP12]], [[OR_13]] -; AVX-NEXT: [[OP_RDX2:%.*]] = or i64 [[OR_14]], [[OR_15]] -; AVX-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX]], [[OP_RDX2]] -; AVX-NEXT: [[OP_RDX4:%.*]] = or i64 [[OP_RDX3]], [[OR]] +; AVX-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; AVX-NEXT: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer +; AVX-NEXT: [[TMP2:%.*]] = icmp eq <16 x i8> [[TMP0]], zeroinitializer +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <16 x i1> [[TMP1]], <16 x i1> [[TMP2]], <16 x i32> +; AVX-NEXT: [[TMP4:%.*]] = select <16 x i1> [[TMP3]], <16 x i64> , <16 x i64> +; AVX-NEXT: [[OP_RDX4:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP4]]) ; AVX-NEXT: ret i64 [[OP_RDX4]] ; ; AVX512-LABEL: @bitmask_16xi8( ; AVX512-NEXT: entry: -; AVX512-NEXT: [[TMP0:%.*]] = load i8, ptr [[SRC:%.*]], align 1 -; AVX512-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0 -; AVX512-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 -; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1 -; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1 -; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer -; AVX512-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> -; AVX512-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9 -; AVX512-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1 -; AVX512-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer -; AVX512-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> -; AVX512-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13 -; AVX512-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1 -; AVX512-NEXT: [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0 -; AVX512-NEXT: [[OR_13:%.*]] = select i1 [[TOBOOL_NOT_13]], i64 0, i64 8192 -; AVX512-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 14 -; AVX512-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX_14]], align 1 -; AVX512-NEXT: [[TOBOOL_NOT_14:%.*]] = icmp eq i8 [[TMP8]], 0 -; AVX512-NEXT: [[OR_14:%.*]] = select i1 [[TOBOOL_NOT_14]], i64 0, i64 16384 -; AVX512-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 15 -; AVX512-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 -; AVX512-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0 -; AVX512-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768 -; AVX512-NEXT: [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> poison, <4 x i32> -; AVX512-NEXT: [[RDX_OP:%.*]] = or <4 x i64> [[TMP10]], [[TMP6]] -; AVX512-NEXT: [[TMP13:%.*]] = shufflevector <4 x i64> [[RDX_OP]], <4 x i64> poison, <8 x i32> -; AVX512-NEXT: [[TMP11:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> [[TMP13]], <8 x i32> -; AVX512-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP11]]) -; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP12]], [[OR_13]] -; AVX512-NEXT: [[OP_RDX2:%.*]] = or i64 [[OR_14]], [[OR_15]] -; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX]], [[OP_RDX2]] -; AVX512-NEXT: [[OP_RDX4:%.*]] = or i64 [[OP_RDX3]], [[OR]] +; AVX512-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[SRC:%.*]], align 1 +; AVX512-NEXT: [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer +; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <16 x i8> [[TMP0]], zeroinitializer +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <16 x i1> [[TMP1]], <16 x i1> [[TMP2]], <16 x i32> +; AVX512-NEXT: [[TMP4:%.*]] = select <16 x i1> [[TMP3]], <16 x i64> , <16 x i64> +; AVX512-NEXT: [[OP_RDX4:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP4]]) ; AVX512-NEXT: ret i64 [[OP_RDX4]] ; entry: @@ -208,84 +130,32 @@ entry: define i64 @bitmask_4xi16(ptr nocapture noundef readonly %src) { ; SSE-LABEL: @bitmask_4xi16( ; SSE-NEXT: entry: -; SSE-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC:%.*]], align 2 -; SSE-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i16 [[TMP0]], 0 -; SSE-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 -; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 1 -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_1]], align 2 -; SSE-NEXT: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer -; SSE-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> -; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 5 -; SSE-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2 -; SSE-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i16 [[TMP4]], 0 -; SSE-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 -; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 6 -; SSE-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2 -; SSE-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i16 [[TMP5]], 0 -; SSE-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 -; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 7 -; SSE-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2 -; SSE-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP6]], 0 -; SSE-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; SSE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; SSE-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]] -; SSE-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_6]], [[OR_7]] -; SSE-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] -; SSE-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]] +; SSE-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[SRC:%.*]], align 2 +; SSE-NEXT: [[TMP1:%.*]] = icmp ne <8 x i16> [[TMP0]], zeroinitializer +; SSE-NEXT: [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP0]], zeroinitializer +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> +; SSE-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i64> , <8 x i64> +; SSE-NEXT: [[OP_RDX3:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP4]]) ; SSE-NEXT: ret i64 [[OP_RDX3]] ; ; AVX-LABEL: @bitmask_4xi16( ; AVX-NEXT: entry: -; AVX-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC:%.*]], align 2 -; AVX-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i16 [[TMP0]], 0 -; AVX-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 -; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 1 -; AVX-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_1]], align 2 -; AVX-NEXT: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer -; AVX-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> -; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 5 -; AVX-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2 -; AVX-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i16 [[TMP4]], 0 -; AVX-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 -; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 6 -; AVX-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2 -; AVX-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i16 [[TMP5]], 0 -; AVX-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 -; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 7 -; AVX-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2 -; AVX-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i16 [[TMP6]], 0 -; AVX-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; AVX-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]] -; AVX-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_6]], [[OR_7]] -; AVX-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] -; AVX-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]] +; AVX-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[SRC:%.*]], align 2 +; AVX-NEXT: [[TMP1:%.*]] = icmp ne <8 x i16> [[TMP0]], zeroinitializer +; AVX-NEXT: [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP0]], zeroinitializer +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> +; AVX-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i64> , <8 x i64> +; AVX-NEXT: [[OP_RDX3:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP4]]) ; AVX-NEXT: ret i64 [[OP_RDX3]] ; ; AVX512-LABEL: @bitmask_4xi16( ; AVX512-NEXT: entry: -; AVX512-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC:%.*]], align 2 -; AVX512-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i16 [[TMP0]], 0 -; AVX512-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 -; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 1 -; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX_1]], align 2 -; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer -; AVX512-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> -; AVX512-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 5 -; AVX512-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2 -; AVX512-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i16 [[TMP4]], 0 -; AVX512-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 -; AVX512-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[SRC]], i64 6 -; AVX512-NEXT: [[TMP5:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_6]], align 2 -; AVX512-NEXT: [[TMP6:%.*]] = icmp eq <2 x i16> [[TMP5]], zeroinitializer -; AVX512-NEXT: [[TMP7:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> zeroinitializer, <2 x i64> -; AVX512-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP8]], [[OR_5]] -; AVX512-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP7]], i32 0 -; AVX512-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1 -; AVX512-NEXT: [[OP_RDX1:%.*]] = or i64 [[TMP9]], [[TMP10]] -; AVX512-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] -; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]] +; AVX512-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[SRC:%.*]], align 2 +; AVX512-NEXT: [[TMP1:%.*]] = icmp ne <8 x i16> [[TMP0]], zeroinitializer +; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP0]], zeroinitializer +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> +; AVX512-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i64> , <8 x i64> +; AVX512-NEXT: [[OP_RDX3:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP4]]) ; AVX512-NEXT: ret i64 [[OP_RDX3]] ; entry: @@ -333,84 +203,32 @@ entry: define i64 @bitmask_8xi32(ptr nocapture noundef readonly %src) { ; SSE-LABEL: @bitmask_8xi32( ; SSE-NEXT: entry: -; SSE-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4 -; SSE-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i32 [[TMP0]], 0 -; SSE-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 -; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 1 -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_1]], align 4 -; SSE-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer -; SSE-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> -; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 5 -; SSE-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_5]], align 4 -; SSE-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i32 [[TMP4]], 0 -; SSE-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 -; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 6 -; SSE-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_6]], align 4 -; SSE-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i32 [[TMP5]], 0 -; SSE-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 -; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 7 -; SSE-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4 -; SSE-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP6]], 0 -; SSE-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; SSE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; SSE-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]] -; SSE-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_6]], [[OR_7]] -; SSE-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] -; SSE-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]] +; SSE-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[SRC:%.*]], align 4 +; SSE-NEXT: [[TMP1:%.*]] = icmp ne <8 x i32> [[TMP0]], zeroinitializer +; SSE-NEXT: [[TMP2:%.*]] = icmp eq <8 x i32> [[TMP0]], zeroinitializer +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> +; SSE-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i64> , <8 x i64> +; SSE-NEXT: [[OP_RDX3:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP4]]) ; SSE-NEXT: ret i64 [[OP_RDX3]] ; ; AVX-LABEL: @bitmask_8xi32( ; AVX-NEXT: entry: -; AVX-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4 -; AVX-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i32 [[TMP0]], 0 -; AVX-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 -; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 1 -; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_1]], align 4 -; AVX-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer -; AVX-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> -; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 5 -; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_5]], align 4 -; AVX-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i32 [[TMP4]], 0 -; AVX-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 -; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 6 -; AVX-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX_6]], align 4 -; AVX-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i32 [[TMP5]], 0 -; AVX-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 -; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 7 -; AVX-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX_7]], align 4 -; AVX-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i32 [[TMP6]], 0 -; AVX-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; AVX-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]] -; AVX-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_6]], [[OR_7]] -; AVX-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] -; AVX-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]] +; AVX-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[SRC:%.*]], align 4 +; AVX-NEXT: [[TMP1:%.*]] = icmp ne <8 x i32> [[TMP0]], zeroinitializer +; AVX-NEXT: [[TMP2:%.*]] = icmp eq <8 x i32> [[TMP0]], zeroinitializer +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> +; AVX-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i64> , <8 x i64> +; AVX-NEXT: [[OP_RDX3:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP4]]) ; AVX-NEXT: ret i64 [[OP_RDX3]] ; ; AVX512-LABEL: @bitmask_8xi32( ; AVX512-NEXT: entry: -; AVX512-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC:%.*]], align 4 -; AVX512-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i32 [[TMP0]], 0 -; AVX512-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 -; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 1 -; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_1]], align 4 -; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer -; AVX512-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> -; AVX512-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 5 -; AVX512-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX_5]], align 4 -; AVX512-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i32 [[TMP4]], 0 -; AVX512-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 -; AVX512-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 6 -; AVX512-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[ARRAYIDX_6]], align 4 -; AVX512-NEXT: [[TMP6:%.*]] = icmp eq <2 x i32> [[TMP5]], zeroinitializer -; AVX512-NEXT: [[TMP7:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> zeroinitializer, <2 x i64> -; AVX512-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP8]], [[OR_5]] -; AVX512-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP7]], i32 0 -; AVX512-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1 -; AVX512-NEXT: [[OP_RDX1:%.*]] = or i64 [[TMP9]], [[TMP10]] -; AVX512-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] -; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]] +; AVX512-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[SRC:%.*]], align 4 +; AVX512-NEXT: [[TMP1:%.*]] = icmp ne <8 x i32> [[TMP0]], zeroinitializer +; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <8 x i32> [[TMP0]], zeroinitializer +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> +; AVX512-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i64> , <8 x i64> +; AVX512-NEXT: [[OP_RDX3:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP4]]) ; AVX512-NEXT: ret i64 [[OP_RDX3]] ; entry: @@ -500,84 +318,32 @@ define i64 @bitmask_8xi64(ptr nocapture noundef readonly %src) { ; ; SSE4-LABEL: @bitmask_8xi64( ; SSE4-NEXT: entry: -; SSE4-NEXT: [[TMP0:%.*]] = load i64, ptr [[SRC:%.*]], align 8 -; SSE4-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i64 [[TMP0]], 0 -; SSE4-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 -; SSE4-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 1 -; SSE4-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr [[ARRAYIDX_1]], align 8 -; SSE4-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer -; SSE4-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> -; SSE4-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 5 -; SSE4-NEXT: [[TMP4:%.*]] = load i64, ptr [[ARRAYIDX_5]], align 8 -; SSE4-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i64 [[TMP4]], 0 -; SSE4-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 -; SSE4-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 6 -; SSE4-NEXT: [[TMP5:%.*]] = load i64, ptr [[ARRAYIDX_6]], align 8 -; SSE4-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i64 [[TMP5]], 0 -; SSE4-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 -; SSE4-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 7 -; SSE4-NEXT: [[TMP6:%.*]] = load i64, ptr [[ARRAYIDX_7]], align 8 -; SSE4-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i64 [[TMP6]], 0 -; SSE4-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; SSE4-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; SSE4-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]] -; SSE4-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_6]], [[OR_7]] -; SSE4-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] -; SSE4-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]] +; SSE4-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr [[SRC:%.*]], align 8 +; SSE4-NEXT: [[TMP1:%.*]] = icmp ne <8 x i64> [[TMP0]], zeroinitializer +; SSE4-NEXT: [[TMP2:%.*]] = icmp eq <8 x i64> [[TMP0]], zeroinitializer +; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> +; SSE4-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i64> , <8 x i64> +; SSE4-NEXT: [[OP_RDX3:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP4]]) ; SSE4-NEXT: ret i64 [[OP_RDX3]] ; ; AVX-LABEL: @bitmask_8xi64( ; AVX-NEXT: entry: -; AVX-NEXT: [[TMP0:%.*]] = load i64, ptr [[SRC:%.*]], align 8 -; AVX-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i64 [[TMP0]], 0 -; AVX-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 -; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 1 -; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr [[ARRAYIDX_1]], align 8 -; AVX-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer -; AVX-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> -; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 5 -; AVX-NEXT: [[TMP4:%.*]] = load i64, ptr [[ARRAYIDX_5]], align 8 -; AVX-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i64 [[TMP4]], 0 -; AVX-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 -; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 6 -; AVX-NEXT: [[TMP5:%.*]] = load i64, ptr [[ARRAYIDX_6]], align 8 -; AVX-NEXT: [[TOBOOL_NOT_6:%.*]] = icmp eq i64 [[TMP5]], 0 -; AVX-NEXT: [[OR_6:%.*]] = select i1 [[TOBOOL_NOT_6]], i64 0, i64 64 -; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 7 -; AVX-NEXT: [[TMP6:%.*]] = load i64, ptr [[ARRAYIDX_7]], align 8 -; AVX-NEXT: [[TOBOOL_NOT_7:%.*]] = icmp eq i64 [[TMP6]], 0 -; AVX-NEXT: [[OR_7:%.*]] = select i1 [[TOBOOL_NOT_7]], i64 0, i64 128 -; AVX-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; AVX-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP7]], [[OR_5]] -; AVX-NEXT: [[OP_RDX1:%.*]] = or i64 [[OR_6]], [[OR_7]] -; AVX-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] -; AVX-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]] +; AVX-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr [[SRC:%.*]], align 8 +; AVX-NEXT: [[TMP1:%.*]] = icmp ne <8 x i64> [[TMP0]], zeroinitializer +; AVX-NEXT: [[TMP2:%.*]] = icmp eq <8 x i64> [[TMP0]], zeroinitializer +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> +; AVX-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i64> , <8 x i64> +; AVX-NEXT: [[OP_RDX3:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP4]]) ; AVX-NEXT: ret i64 [[OP_RDX3]] ; ; AVX512-LABEL: @bitmask_8xi64( ; AVX512-NEXT: entry: -; AVX512-NEXT: [[TMP0:%.*]] = load i64, ptr [[SRC:%.*]], align 8 -; AVX512-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i64 [[TMP0]], 0 -; AVX512-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 -; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 1 -; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr [[ARRAYIDX_1]], align 8 -; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <4 x i64> [[TMP1]], zeroinitializer -; AVX512-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> zeroinitializer, <4 x i64> -; AVX512-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 5 -; AVX512-NEXT: [[TMP4:%.*]] = load i64, ptr [[ARRAYIDX_5]], align 8 -; AVX512-NEXT: [[TOBOOL_NOT_5:%.*]] = icmp eq i64 [[TMP4]], 0 -; AVX512-NEXT: [[OR_5:%.*]] = select i1 [[TOBOOL_NOT_5]], i64 0, i64 32 -; AVX512-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 6 -; AVX512-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_6]], align 8 -; AVX512-NEXT: [[TMP6:%.*]] = icmp eq <2 x i64> [[TMP5]], zeroinitializer -; AVX512-NEXT: [[TMP7:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> zeroinitializer, <2 x i64> -; AVX512-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP3]]) -; AVX512-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP8]], [[OR_5]] -; AVX512-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP7]], i32 0 -; AVX512-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP7]], i32 1 -; AVX512-NEXT: [[OP_RDX1:%.*]] = or i64 [[TMP9]], [[TMP10]] -; AVX512-NEXT: [[OP_RDX2:%.*]] = or i64 [[OP_RDX]], [[OP_RDX1]] -; AVX512-NEXT: [[OP_RDX3:%.*]] = or i64 [[OP_RDX2]], [[OR]] +; AVX512-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr [[SRC:%.*]], align 8 +; AVX512-NEXT: [[TMP1:%.*]] = icmp ne <8 x i64> [[TMP0]], zeroinitializer +; AVX512-NEXT: [[TMP2:%.*]] = icmp eq <8 x i64> [[TMP0]], zeroinitializer +; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP1]], <8 x i1> [[TMP2]], <8 x i32> +; AVX512-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i64> , <8 x i64> +; AVX512-NEXT: [[OP_RDX3:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP4]]) ; AVX512-NEXT: ret i64 [[OP_RDX3]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll b/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll index 252746b465bc..428f54f15388 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll @@ -9,16 +9,10 @@ define i1 @test(i64 %v1, ptr %v2, i32 %v3, i1 %v4) { ; CHECK-NEXT: [[TT3:%.*]] = lshr i64 [[V1]], 32 ; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TT3]] to i32 -; CHECK-NEXT: [[TT2:%.*]] = and i32 [[TMP1]], 255 -; CHECK-NEXT: [[TT1:%.*]] = and i32 [[TMP2]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TT1]] to i8 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i8> poison, i8 [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP33:%.*]] = trunc i32 [[TT2]] to i8 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i8> [[TMP7]], i8 [[TMP33]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = zext <2 x i8> [[TMP4]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <2 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP34:%.*]] = insertelement <2 x i32> poison, i32 [[TT1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP34]], i32 [[TT2]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i32> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <2 x i32> [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[V3]], i32 0 ; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32>