diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 7812a301efbd..4cd97e726122 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -2133,7 +2133,8 @@ public: VScaleRange = getVScaleRange(I->getCaller(), 64); unsigned EltWidth = getTLI()->getBitWidthForCttzElements( - RetTy, ArgType.getVectorElementCount(), ZeroIsPoison, &VScaleRange); + getTLI()->getValueType(DL, RetTy), ArgType.getVectorElementCount(), + ZeroIsPoison, &VScaleRange); Type *NewEltTy = IntegerType::getIntNTy(RetTy->getContext(), EltWidth); // Create the new vector type & get the vector length diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index eac6faceafd0..efe1e7e41da1 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1580,7 +1580,7 @@ enum NodeType { EXPERIMENTAL_VECTOR_HISTOGRAM, - /// Returns the number of number of trailing (least significant) zero elements - /// in a vector. Has a single i1 vector operand. The result is poison if the + /// Returns the number of trailing (least significant) zero elements + /// in a vector. Has a single vector operand. The result is poison if the /// return type isn't wide enough to hold the maximum number of elements in /// the input vector. CTTZ_ELTS, diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 51c00b2591ec..ec972683735f 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -498,7 +498,7 @@ public: /// Return the minimum number of bits required to hold the maximum possible /// number of trailing zero vector elements. - unsigned getBitWidthForCttzElements(Type *RetTy, ElementCount EC, + unsigned getBitWidthForCttzElements(EVT RetVT, ElementCount EC, bool ZeroIsPoison, const ConstantRange *VScaleRange) const; @@ -5820,6 +5820,10 @@ public: /// temporarily, advance store position, before re-loading the final vector. 
SDValue expandVECTOR_COMPRESS(SDNode *Node, SelectionDAG &DAG) const; + /// Expand a CTTZ_ELTS or CTTZ_ELTS_ZERO_POISON by calculating (VL - i) for + /// each active lane (i), getting the maximum and subtracting it from VL. + SDValue expandCttzElts(SDNode *Node, SelectionDAG &DAG) const; + /// Expands PARTIAL_REDUCE_S/UMLA nodes to a series of simpler operations, /// consisting of zext/sext, extract_subvector, mul and add operations. SDValue expandPartialReduceMLA(SDNode *Node, SelectionDAG &DAG) const; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 3d5b838c2ff8..3d75cc4a1b4b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -2167,6 +2167,10 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::VECTOR_FIND_LAST_ACTIVE: Res = PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(N, OpNo); break; + case ISD::CTTZ_ELTS: + case ISD::CTTZ_ELTS_ZERO_POISON: + Res = PromoteIntOp_CTTZ_ELTS(N); + break; case ISD::GET_ACTIVE_LANE_MASK: Res = PromoteIntOp_GET_ACTIVE_LANE_MASK(N); break; @@ -3000,6 +3004,13 @@ SDValue DAGTypeLegalizer::PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N, return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } +SDValue DAGTypeLegalizer::PromoteIntOp_CTTZ_ELTS(SDNode *N) { + // CTTZ_ELTS tests whole elements against zero, so the promoted high bits + // (undefined after GetPromotedInteger) must be cleared first. + SDValue Op = ZExtPromotedInteger(N->getOperand(0)); + return SDValue(DAG.UpdateNodeOperands(N, Op), 0); +} + SDValue DAGTypeLegalizer::PromoteIntOp_GET_ACTIVE_LANE_MASK(SDNode *N) { SmallVector NewOps(N->ops()); NewOps[0] = ZExtPromotedInteger(N->getOperand(0)); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 14f361f8bcae..4a85c7ab39a3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -419,6 +419,7 @@ private: SDValue PromoteIntOp_VP_SPLICE(SDNode *N, unsigned OpNo); SDValue 
PromoteIntOp_VECTOR_HISTOGRAM(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N, unsigned OpNo); + SDValue PromoteIntOp_CTTZ_ELTS(SDNode *N); SDValue PromoteIntOp_GET_ACTIVE_LANE_MASK(SDNode *N); SDValue PromoteIntOp_PARTIAL_REDUCE_MLA(SDNode *N); SDValue PromoteIntOp_LOOP_DEPENDENCE_MASK(SDNode *N, unsigned OpNo); @@ -987,6 +988,7 @@ private: SDValue SplitVecOp_FPOpDifferentTypes(SDNode *N); SDValue SplitVecOp_CMP(SDNode *N); SDValue SplitVecOp_FP_TO_XINT_SAT(SDNode *N); + SDValue SplitVecOp_CttzElts(SDNode *N); SDValue SplitVecOp_VP_CttzElements(SDNode *N); SDValue SplitVecOp_VECTOR_HISTOGRAM(SDNode *N); SDValue SplitVecOp_PARTIAL_REDUCE_MLA(SDNode *N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 2409a1f31e26..03396ba01615 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -517,6 +517,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::VECREDUCE_FMIN: case ISD::VECREDUCE_FMINIMUM: case ISD::VECREDUCE_FMUL: + case ISD::CTTZ_ELTS: + case ISD::CTTZ_ELTS_ZERO_POISON: case ISD::VECTOR_FIND_LAST_ACTIVE: Action = TLI.getOperationAction(Node->getOpcode(), Node->getOperand(0).getValueType()); @@ -1354,6 +1356,10 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) { case ISD::VECTOR_COMPRESS: Results.push_back(TLI.expandVECTOR_COMPRESS(Node, DAG)); return; + case ISD::CTTZ_ELTS: + case ISD::CTTZ_ELTS_ZERO_POISON: + Results.push_back(TLI.expandCttzElts(Node, DAG)); + return; case ISD::VECTOR_FIND_LAST_ACTIVE: Results.push_back(TLI.expandVectorFindLastActive(Node, DAG)); return; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 564bf3b7f152..807863b46606 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp 
@@ -3742,6 +3742,10 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::VP_REDUCE_FMINIMUM: Res = SplitVecOp_VP_REDUCE(N, OpNo); break; + case ISD::CTTZ_ELTS: + case ISD::CTTZ_ELTS_ZERO_POISON: + Res = SplitVecOp_CttzElts(N); + break; case ISD::VP_CTTZ_ELTS: case ISD::VP_CTTZ_ELTS_ZERO_UNDEF: Res = SplitVecOp_VP_CttzElements(N); @@ -4828,6 +4832,26 @@ SDValue DAGTypeLegalizer::SplitVecOp_FP_TO_XINT_SAT(SDNode *N) { return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } +SDValue DAGTypeLegalizer::SplitVecOp_CttzElts(SDNode *N) { + SDLoc DL(N); + EVT ResVT = N->getValueType(0); + + SDValue Lo, Hi; + SDValue VecOp = N->getOperand(0); + GetSplitVector(VecOp, Lo, Hi); + + // If CTTZ_ELTS(Lo) != VL, Lo has a non-zero lane: result is CTTZ_ELTS(Lo). + // Else it is VL + N's opcode applied to Hi; Lo always uses plain CTTZ_ELTS. + SDValue ResLo = DAG.getNode(ISD::CTTZ_ELTS, DL, ResVT, Lo); + SDValue VL = + DAG.getElementCount(DL, ResVT, Lo.getValueType().getVectorElementCount()); + SDValue ResLoNotVL = + DAG.getSetCC(DL, getSetCCResultType(ResVT), ResLo, VL, ISD::SETNE); + SDValue ResHi = DAG.getNode(N->getOpcode(), DL, ResVT, Hi); + return DAG.getSelect(DL, ResVT, ResLoNotVL, ResLo, + DAG.getNode(ISD::ADD, DL, ResVT, VL, ResHi)); +} + SDValue DAGTypeLegalizer::SplitVecOp_VP_CttzElements(SDNode *N) { SDLoc DL(N); EVT ResVT = N->getValueType(0); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 04b17b56b3d4..2de519e451f4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8324,55 +8324,13 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, return; } case Intrinsic::experimental_cttz_elts: { - auto DL = getCurSDLoc(); SDValue Op = getValue(I.getOperand(0)); - EVT OpVT = Op.getValueType(); EVT RetTy = TLI.getValueType(DAG.getDataLayout(), I.getType()); bool ZeroIsPoison = !cast(getValue(I.getOperand(1)))->isZero(); - 
- if (!TLI.shouldExpandCttzElements(OpVT)) { - SDValue Ret = DAG.getNode(ZeroIsPoison ? ISD::CTTZ_ELTS_ZERO_POISON - : ISD::CTTZ_ELTS, - sdl, RetTy, Op); - setValue(&I, Ret); - return; - } - - if (OpVT.getScalarType() != MVT::i1) { - // Compare the input vector elements to zero & use to count trailing zeros - SDValue AllZero = DAG.getConstant(0, DL, OpVT); - OpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, - OpVT.getVectorElementCount()); - Op = DAG.getSetCC(DL, OpVT, Op, AllZero, ISD::SETNE); - } - - // If the zero-is-poison flag is set, we can assume the upper limit - // of the result is VF-1. - ConstantRange VScaleRange(1, true); // Dummy value. - if (isa(I.getOperand(0)->getType())) - VScaleRange = getVScaleRange(I.getCaller(), 64); - unsigned EltWidth = TLI.getBitWidthForCttzElements( - I.getType(), OpVT.getVectorElementCount(), ZeroIsPoison, &VScaleRange); - - MVT NewEltTy = MVT::getIntegerVT(EltWidth); - - // Create the new vector type & get the vector length - EVT NewVT = EVT::getVectorVT(*DAG.getContext(), NewEltTy, - OpVT.getVectorElementCount()); - - SDValue VL = - DAG.getElementCount(DL, NewEltTy, OpVT.getVectorElementCount()); - - SDValue StepVec = DAG.getStepVector(DL, NewVT); - SDValue SplatVL = DAG.getSplat(NewVT, DL, VL); - SDValue StepVL = DAG.getNode(ISD::SUB, DL, NewVT, SplatVL, StepVec); - SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, Op); - SDValue And = DAG.getNode(ISD::AND, DL, NewVT, StepVL, Ext); - SDValue Max = DAG.getNode(ISD::VECREDUCE_UMAX, DL, NewEltTy, And); - SDValue Sub = DAG.getNode(ISD::SUB, DL, NewEltTy, VL, Max); - - SDValue Ret = DAG.getZExtOrTrunc(Sub, DL, RetTy); + SDValue Ret = + DAG.getNode(ZeroIsPoison ? 
ISD::CTTZ_ELTS_ZERO_POISON : ISD::CTTZ_ELTS, + sdl, RetTy, Op); setValue(&I, Ret); return; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index e10cc6da4dfa..cdda71b05b72 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -10078,8 +10078,7 @@ SDValue TargetLowering::expandVectorFindLastActive(SDNode *N, VScaleRange = getVScaleRange(&DAG.getMachineFunction().getFunction(), 64); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); uint64_t EltWidth = TLI.getBitWidthForCttzElements( - EVT(getVectorIdxTy(DAG.getDataLayout())).getTypeForEVT(*DAG.getContext()), - MaskVT.getVectorElementCount(), + EVT(getVectorIdxTy(DAG.getDataLayout())), MaskVT.getVectorElementCount(), /*ZeroIsPoison=*/true, &VScaleRange); // If the step vector element type is smaller than the mask element type, // use the mask type directly to avoid widening issues. @@ -12541,6 +12540,65 @@ SDValue TargetLowering::expandVECTOR_COMPRESS(SDNode *Node, return DAG.getLoad(VecVT, DL, Chain, StackPtr, PtrInfo); } +SDValue TargetLowering::expandCttzElts(SDNode *Node, SelectionDAG &DAG) const { + SDLoc DL(Node); + EVT VT = Node->getValueType(0); + SDValue Op = Node->getOperand(0); + EVT OpVT = Op.getValueType(); + + if (OpVT.getVectorElementType() != MVT::i1) { + // Compare the input vector elements to zero & use to count trailing zeros. + SDValue AllZero = DAG.getConstant(0, DL, OpVT); + EVT I1OpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, + OpVT.getVectorElementCount()); + // If cttz_elts is legal for the i1 type, use it instead of expanding. 
+ if (isOperationLegalOrCustom(Node->getOpcode(), I1OpVT)) { + Op = DAG.getSetCC(DL, I1OpVT, Op, AllZero, ISD::SETNE); + return DAG.getNode(Node->getOpcode(), DL, VT, Op); + } + + Op = DAG.getSetCC(DL, OpVT, Op, AllZero, ISD::SETNE); + } + + // If the zero-is-poison flag is set, we can assume the upper limit + // of the result is VF-1. + bool ZeroIsPoison = Node->getOpcode() == ISD::CTTZ_ELTS_ZERO_POISON; + ConstantRange VScaleRange(1, true); // Dummy value. + if (OpVT.isScalableVector()) + VScaleRange = getVScaleRange(&DAG.getMachineFunction().getFunction(), 64); + unsigned EltWidth = getBitWidthForCttzElements( + VT, OpVT.getVectorElementCount(), ZeroIsPoison, &VScaleRange); + + EVT NewEltVT = MVT::getIntegerVT(EltWidth); + + // Create the new vector type & get the vector length + EVT NewVT = EVT::getVectorVT(*DAG.getContext(), NewEltVT, + OpVT.getVectorElementCount()); + + // Promote types now to avoid redundant zexts. NewVT is synthesized above and + // need not be a simple MVT, so query the EVT overload of getTypeAction. + if (getTypeAction(*DAG.getContext(), NewVT) == TypePromoteInteger) { + NewVT = getTypeToTransformTo(*DAG.getContext(), NewVT); + NewEltVT = NewVT.getVectorElementType(); + } + if (getTypeAction(*DAG.getContext(), NewEltVT) == TypePromoteInteger) + NewEltVT = getTypeToTransformTo(*DAG.getContext(), NewEltVT); + + SDValue VL = DAG.getElementCount(DL, NewEltVT, NewVT.getVectorElementCount()); + + SDValue StepVec = DAG.getStepVector(DL, NewVT); + SDValue SplatVL = DAG.getSplat(NewVT, DL, VL); + SDValue StepVL = DAG.getNode(ISD::SUB, DL, NewVT, SplatVL, StepVec); + SDValue Ext = DAG.getSExtOrTrunc(Op, DL, NewVT); + SDValue And = DAG.getNode(ISD::AND, DL, NewVT, StepVL, Ext); + SDValue Max = + DAG.getNode(ISD::VECREDUCE_UMAX, DL, NewVT.getVectorElementType(), And); + SDValue Sub = DAG.getNode(ISD::SUB, DL, NewEltVT, VL, + DAG.getZExtOrTrunc(Max, DL, NewEltVT)); + + return DAG.getZExtOrTrunc(Sub, DL, VT); +} + SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, SelectionDAG &DAG) const { SDLoc DL(N); diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp 
b/llvm/lib/CodeGen/TargetLoweringBase.cpp index b6d5a4c22e13..b4dc9afae725 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1347,7 +1347,7 @@ bool TargetLoweringBase::isFreeAddrSpaceCast(unsigned SrcAS, } unsigned TargetLoweringBase::getBitWidthForCttzElements( - Type *RetTy, ElementCount EC, bool ZeroIsPoison, + EVT RetVT, ElementCount EC, bool ZeroIsPoison, const ConstantRange *VScaleRange) const { // Find the smallest "sensible" element type to use for the expansion. ConstantRange CR(APInt(64, EC.getKnownMinValue())); @@ -1357,7 +1357,7 @@ unsigned TargetLoweringBase::getBitWidthForCttzElements( if (ZeroIsPoison) CR = CR.subtract(APInt(64, 1)); - unsigned EltWidth = RetTy->getScalarSizeInBits(); + unsigned EltWidth = RetVT.getScalarSizeInBits(); EltWidth = std::min(EltWidth, CR.getActiveBits()); EltWidth = std::max(llvm::bit_ceil(EltWidth), (unsigned)8); diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 8f09e25bbfc2..3dca57370e03 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1733,7 +1733,7 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, // Find a suitable type for a stepvector. 
ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64)); unsigned EltWidth = getTLI()->getBitWidthForCttzElements( - MaskTy->getScalarType(), MaskTy->getElementCount(), + MaskLT.second.getScalarType(), MaskTy->getElementCount(), /*ZeroIsPoison=*/true, &VScaleRange); EltWidth = std::max(EltWidth, MaskTy->getScalarSizeInBits()); Type *StepTy = Type::getIntNTy(MaskTy->getContext(), EltWidth); diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll index 33e7c69f041d..38ea26a4fb28 100644 --- a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll +++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll @@ -5,40 +5,31 @@ ; WITH VSCALE RANGE define i32 @ctz_nxv32i1( %a) #0 { -; CHECK-LABEL: ctz_nxv32i1: -; CHECK: // %bb.0: -; CHECK-NEXT: index z0.h, #0, #-1 -; CHECK-NEXT: cnth x8 -; CHECK-NEXT: punpklo p2.h, p0.b -; CHECK-NEXT: neg x8, x8 -; CHECK-NEXT: punpklo p3.h, p1.b -; CHECK-NEXT: rdvl x9, #2 -; CHECK-NEXT: mov z1.h, w8 -; CHECK-NEXT: rdvl x8, #-1 -; CHECK-NEXT: punpkhi p0.h, p0.b -; CHECK-NEXT: mov z2.h, w8 -; CHECK-NEXT: punpkhi p1.h, p1.b -; CHECK-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: inch z0.h, all, mul #4 -; CHECK-NEXT: mov z5.h, p3/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z6.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z7.h, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: add z1.h, z0.h, z1.h -; CHECK-NEXT: add z4.h, z0.h, z2.h -; CHECK-NEXT: and z0.d, z0.d, z3.d -; CHECK-NEXT: add z2.h, z1.h, z2.h -; CHECK-NEXT: and z3.d, z4.d, z5.d -; CHECK-NEXT: and z1.d, z1.d, z6.d -; CHECK-NEXT: and z2.d, z2.d, z7.d -; CHECK-NEXT: umax z0.h, p0/m, z0.h, z3.h -; CHECK-NEXT: umax z1.h, p0/m, z1.h, z2.h -; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h -; CHECK-NEXT: umaxv h0, p0, z0.h -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: sub w8, w9, w8 -; CHECK-NEXT: and w0, w8, #0xffff -; CHECK-NEXT: ret +; NONSTREAMING-LABEL: ctz_nxv32i1: +; 
NONSTREAMING: // %bb.0: +; NONSTREAMING-NEXT: ptrue p2.b +; NONSTREAMING-NEXT: rdvl x8, #1 +; NONSTREAMING-NEXT: mov w10, w8 +; NONSTREAMING-NEXT: brkb p0.b, p2/z, p0.b +; NONSTREAMING-NEXT: brkb p1.b, p2/z, p1.b +; NONSTREAMING-NEXT: cntp x9, p0, p0.b +; NONSTREAMING-NEXT: incp x8, p1.b +; NONSTREAMING-NEXT: cmp w9, w10 +; NONSTREAMING-NEXT: csel w0, w9, w8, ne +; NONSTREAMING-NEXT: ret +; +; STREAMING-LABEL: ctz_nxv32i1: +; STREAMING: // %bb.0: +; STREAMING-NEXT: ptrue p2.b +; STREAMING-NEXT: rdvl x10, #1 +; STREAMING-NEXT: brkb p1.b, p2/z, p1.b +; STREAMING-NEXT: brkb p0.b, p2/z, p0.b +; STREAMING-NEXT: cntp x8, p1, p1.b +; STREAMING-NEXT: cntp x9, p0, p0.b +; STREAMING-NEXT: incb x8 +; STREAMING-NEXT: cmp w9, w10 +; STREAMING-NEXT: csel w0, w9, w8, ne +; STREAMING-NEXT: ret %res = call i32 @llvm.experimental.cttz.elts.i32.nxv32i1( %a, i1 0) ret i32 %res } @@ -47,17 +38,9 @@ define i32 @ctz_nxv4i32( %a) #0 { ; CHECK-LABEL: ctz_nxv4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: cntw x9 ; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0 -; CHECK-NEXT: index z0.s, #0, #-1 -; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: incw z0.s -; CHECK-NEXT: and z0.d, z0.d, z1.d -; CHECK-NEXT: and z0.s, z0.s, #0xff -; CHECK-NEXT: umaxv s0, p0, z0.s -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: sub w8, w9, w8 -; CHECK-NEXT: and w0, w8, #0xff +; CHECK-NEXT: brkb p0.b, p0/z, p1.b +; CHECK-NEXT: cntp x0, p0, p0.s ; CHECK-NEXT: ret %res = call i32 @llvm.experimental.cttz.elts.i32.nxv4i32( %a, i1 0) ret i32 %res @@ -69,40 +52,9 @@ define i64 @vscale_4096( %a) #1 { ; CHECK-LABEL: vscale_4096: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: index z1.s, #0, #-1 -; CHECK-NEXT: cntw x8 -; CHECK-NEXT: neg x8, x8 -; CHECK-NEXT: rdvl x9, #1 -; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 -; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: cnth x8 -; CHECK-NEXT: neg x8, x8 -; CHECK-NEXT: incw z1.s, all, mul #4 -; CHECK-NEXT: mov z2.s, w8 -; CHECK-NEXT: punpklo p1.h, p0.b 
-; CHECK-NEXT: punpkhi p0.h, p0.b -; CHECK-NEXT: add z0.s, z1.s, z0.s -; CHECK-NEXT: add z4.s, z1.s, z2.s -; CHECK-NEXT: punpkhi p2.h, p1.b -; CHECK-NEXT: punpkhi p3.h, p0.b -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: add z2.s, z0.s, z2.s -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: mov z5.s, p3/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z6.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z7.s, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: and z0.d, z0.d, z3.d -; CHECK-NEXT: and z2.d, z2.d, z5.d -; CHECK-NEXT: and z3.d, z4.d, z6.d -; CHECK-NEXT: and z1.d, z1.d, z7.d -; CHECK-NEXT: umax z0.s, p0/m, z0.s, z2.s -; CHECK-NEXT: umax z1.s, p0/m, z1.s, z3.s -; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: umaxv s0, p0, z0.s -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: sub w0, w9, w8 +; CHECK-NEXT: cmpne p1.b, p0/z, z0.b, #0 +; CHECK-NEXT: brkb p0.b, p0/z, p1.b +; CHECK-NEXT: cntp x0, p0, p0.b ; CHECK-NEXT: ret %res = call i64 @llvm.experimental.cttz.elts.i64.nxv16i8( %a, i1 0) ret i64 %res @@ -112,26 +64,9 @@ define i64 @vscale_4096_poison( %a) #1 { ; CHECK-LABEL: vscale_4096_poison: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: index z1.h, #0, #-1 -; CHECK-NEXT: cnth x8 -; CHECK-NEXT: neg x8, x8 -; CHECK-NEXT: rdvl x9, #1 -; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 -; CHECK-NEXT: mov z0.h, w8 -; CHECK-NEXT: inch z1.h, all, mul #2 -; CHECK-NEXT: punpkhi p1.h, p0.b -; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: add z0.h, z1.h, z0.h -; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z3.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: and z0.d, z0.d, z2.d -; CHECK-NEXT: and z1.d, z1.d, z3.d -; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h -; CHECK-NEXT: umaxv h0, p0, z0.h -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: sub w8, w9, w8 -; CHECK-NEXT: and x0, x8, #0xffff +; CHECK-NEXT: cmpne p1.b, p0/z, z0.b, #0 
+; CHECK-NEXT: brkb p0.b, p0/z, p1.b +; CHECK-NEXT: cntp x0, p0, p0.b ; CHECK-NEXT: ret %res = call i64 @llvm.experimental.cttz.elts.i64.nxv16i8( %a, i1 1) ret i64 %res diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts.ll index a7ffefdecb5f..56720d62c019 100644 --- a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts.ll +++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts.ll @@ -14,12 +14,11 @@ define i8 @ctz_v8i1(<8 x i1> %a) { ; CHECK-NEXT: .byte 1 ; CHECK-LABEL: ctz_v8i1: ; CHECK: // %bb.0: -; CHECK-NEXT: shl v0.8b, v0.8b, #7 ; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: cmeq v0.8b, v0.8b, #0 ; CHECK-NEXT: mov w9, #8 // =0x8 ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI0_0] -; CHECK-NEXT: cmlt v0.8b, v0.8b, #0 -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: bic v0.8b, v1.8b, v0.8b ; CHECK-NEXT: umaxv b0, v0.8b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: sub w0, w9, w8 @@ -48,16 +47,14 @@ define i32 @ctz_v16i1(<16 x i1> %a) { ; CHECK-NEXT: .byte 1 ; CHECK-LABEL: ctz_v16i1: ; CHECK: // %bb.0: -; CHECK-NEXT: shl v0.16b, v0.16b, #7 ; CHECK-NEXT: adrp x8, .LCPI1_0 +; CHECK-NEXT: cmeq v0.16b, v0.16b, #0 ; CHECK-NEXT: mov w9, #16 // =0x10 ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0] -; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: bic v0.16b, v1.16b, v0.16b ; CHECK-NEXT: umaxv b0, v0.16b ; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: sub w8, w9, w8 -; CHECK-NEXT: and w0, w8, #0xff +; CHECK-NEXT: sub w0, w9, w8 ; CHECK-NEXT: ret %res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 0) ret i32 %res @@ -79,8 +76,7 @@ define i16 @ctz_v4i32(<4 x i32> %a) { ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: umaxv h0, v0.4h ; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: sub w8, w9, w8 -; CHECK-NEXT: and w0, w8, #0xff +; CHECK-NEXT: sub w0, w9, w8 ; CHECK-NEXT: ret %res = call i16 @llvm.experimental.cttz.elts.i16.v4i32(<4 x i32> %a, i1 0) ret i16 %res @@ -98,12 +94,11 @@ 
define i7 @ctz_i7_v8i1(<8 x i1> %a) { ; CHECK-NEXT: .byte 1 ; CHECK-LABEL: ctz_i7_v8i1: ; CHECK: // %bb.0: -; CHECK-NEXT: shl v0.8b, v0.8b, #7 ; CHECK-NEXT: adrp x8, .LCPI3_0 +; CHECK-NEXT: cmeq v0.8b, v0.8b, #0 ; CHECK-NEXT: mov w9, #8 // =0x8 ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI3_0] -; CHECK-NEXT: cmlt v0.8b, v0.8b, #0 -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: bic v0.8b, v1.8b, v0.8b ; CHECK-NEXT: umaxv b0, v0.8b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: sub w0, w9, w8 @@ -126,12 +121,11 @@ define i8 @ctz_v8i1_poison(<8 x i1> %a) { ; CHECK-NEXT: .byte 1 ; CHECK-LABEL: ctz_v8i1_poison: ; CHECK: // %bb.0: -; CHECK-NEXT: shl v0.8b, v0.8b, #7 ; CHECK-NEXT: adrp x8, .LCPI4_0 +; CHECK-NEXT: cmeq v0.8b, v0.8b, #0 ; CHECK-NEXT: mov w9, #8 // =0x8 ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI4_0] -; CHECK-NEXT: cmlt v0.8b, v0.8b, #0 -; CHECK-NEXT: and v0.8b, v0.8b, v1.8b +; CHECK-NEXT: bic v0.8b, v1.8b, v0.8b ; CHECK-NEXT: umaxv b0, v0.8b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: sub w0, w9, w8 diff --git a/llvm/test/CodeGen/AArch64/sve-mask-partition.ll b/llvm/test/CodeGen/AArch64/sve-mask-partition.ll index e4bad94f08b4..9aa673ee6cce 100644 --- a/llvm/test/CodeGen/AArch64/sve-mask-partition.ll +++ b/llvm/test/CodeGen/AArch64/sve-mask-partition.ll @@ -223,163 +223,17 @@ define <2 x i1> @mask_include_active_v2(<2 x i1> %mask.in) { define @mask_exclude_active_nxv32( %mask.in) { ; CHECK-LABEL: mask_exclude_active_nxv32: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-9 -; CHECK-NEXT: str p11, [sp] // 2-byte Spill -; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Spill -; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Spill -; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Spill -; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Spill -; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Spill -; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Spill -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Spill -; CHECK-NEXT: str z15, [sp, #1, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z14, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z13, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z12, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z11, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z10, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z9, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: str z8, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0a, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0xc8, 0x00, 0x1e, 0x22 // sp + 16 + 72 * VG -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x40, 0x1c // $d8 @ cfa - 8 * VG - 16 -; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x40, 0x1c // $d9 @ cfa - 16 * VG - 16 -; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x40, 0x1c // $d10 @ cfa - 24 * VG - 16 -; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x40, 0x1c // $d11 @ cfa - 32 * VG - 16 -; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x40, 0x1c // $d12 @ cfa - 40 * VG - 16 -; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x40, 0x1c // $d13 @ cfa - 48 * VG - 16 -; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x09, 0x92, 0x2e, 0x00, 0x11, 
0x48, 0x1e, 0x22, 0x40, 0x1c // $d14 @ cfa - 56 * VG - 16 -; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x09, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x40, 0x1c // $d15 @ cfa - 64 * VG - 16 -; CHECK-NEXT: index z2.d, #0, #-1 -; CHECK-NEXT: cnth x8 -; CHECK-NEXT: punpkhi p5.h, p0.b -; CHECK-NEXT: neg x8, x8 -; CHECK-NEXT: punpkhi p4.h, p1.b -; CHECK-NEXT: cntw x9 -; CHECK-NEXT: mov z0.d, x8 -; CHECK-NEXT: punpklo p3.h, p5.b -; CHECK-NEXT: rdvl x8, #-1 -; CHECK-NEXT: punpklo p2.h, p4.b -; CHECK-NEXT: mov z1.d, x8 -; CHECK-NEXT: neg x8, x9 -; CHECK-NEXT: incd z2.d, all, mul #16 -; CHECK-NEXT: punpklo p10.h, p0.b -; CHECK-NEXT: mov z5.d, x8 -; CHECK-NEXT: punpklo p9.h, p3.b -; CHECK-NEXT: cntd x8 -; CHECK-NEXT: rdvl x9, #2 -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: neg x8, x8 -; CHECK-NEXT: add z4.d, z2.d, z0.d -; CHECK-NEXT: punpklo p8.h, p2.b -; CHECK-NEXT: mov z7.d, p9/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: punpklo p6.h, p10.b -; CHECK-NEXT: mov z28.d, x8 -; CHECK-NEXT: add z25.d, z2.d, z5.d -; CHECK-NEXT: punpklo p7.h, p1.b -; CHECK-NEXT: mov z3.d, p8/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: add z6.d, z4.d, z1.d -; CHECK-NEXT: punpklo p8.h, p6.b -; CHECK-NEXT: and z4.d, z4.d, z7.d -; CHECK-NEXT: punpkhi p0.h, p1.b -; CHECK-NEXT: add z28.d, z2.d, z28.d -; CHECK-NEXT: add z26.d, z25.d, z0.d -; CHECK-NEXT: punpkhi p1.h, p10.b -; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Reload -; CHECK-NEXT: mov z7.d, p8/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: punpklo p11.h, p7.b -; CHECK-NEXT: and z3.d, z6.d, z3.d -; CHECK-NEXT: add z6.d, z2.d, z1.d -; CHECK-NEXT: punpklo p9.h, p0.b -; CHECK-NEXT: add z29.d, z25.d, z1.d -; CHECK-NEXT: add z5.d, z28.d, z5.d -; CHECK-NEXT: punpklo p8.h, p1.b -; CHECK-NEXT: mov z24.d, p11/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ldr p11, [sp] // 2-byte Reload -; CHECK-NEXT: punpkhi p5.h, p5.b -; CHECK-NEXT: mov z27.d, p9/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: add z31.d, z26.d, z1.d -; CHECK-NEXT: punpkhi p4.h, p4.b -; 
CHECK-NEXT: mov z30.d, p8/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: and z2.d, z2.d, z7.d -; CHECK-NEXT: punpklo p9.h, p5.b -; CHECK-NEXT: and z6.d, z6.d, z24.d -; CHECK-NEXT: add z12.d, z5.d, z1.d -; CHECK-NEXT: punpklo p8.h, p4.b -; CHECK-NEXT: and z7.d, z29.d, z27.d -; CHECK-NEXT: add z29.d, z28.d, z0.d -; CHECK-NEXT: mov z24.d, p9/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Reload -; CHECK-NEXT: punpkhi p3.h, p3.b -; CHECK-NEXT: mov z8.d, p8/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Reload -; CHECK-NEXT: punpkhi p2.h, p2.b -; CHECK-NEXT: add z0.d, z5.d, z0.d -; CHECK-NEXT: punpkhi p7.h, p7.b -; CHECK-NEXT: and z25.d, z25.d, z30.d -; CHECK-NEXT: punpkhi p6.h, p6.b -; CHECK-NEXT: and z24.d, z26.d, z24.d -; CHECK-NEXT: mov z10.d, p2/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: and z26.d, z31.d, z8.d -; CHECK-NEXT: punpkhi p1.h, p1.b -; CHECK-NEXT: mov z8.d, p3/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: punpkhi p0.h, p0.b -; CHECK-NEXT: add z27.d, z28.d, z1.d -; CHECK-NEXT: mov z30.d, p7/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Reload -; CHECK-NEXT: punpkhi p3.h, p5.b -; CHECK-NEXT: mov z31.d, p6/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Reload -; CHECK-NEXT: punpkhi p2.h, p4.b -; CHECK-NEXT: add z9.d, z29.d, z1.d -; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Reload -; CHECK-NEXT: mov z11.d, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: mov z13.d, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Reload -; CHECK-NEXT: mov z14.d, p3/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: add z1.d, z0.d, z1.d -; CHECK-NEXT: mov z15.d, p2/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: and z27.d, z27.d, z30.d -; CHECK-NEXT: and z28.d, z28.d, z31.d -; CHECK-NEXT: and z29.d, z29.d, z8.d -; CHECK-NEXT: ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: and z30.d, z9.d, z10.d -; 
CHECK-NEXT: ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: and z5.d, z5.d, z11.d -; CHECK-NEXT: ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: and z31.d, z12.d, z13.d -; CHECK-NEXT: ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: and z0.d, z0.d, z14.d -; CHECK-NEXT: ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: and z1.d, z1.d, z15.d -; CHECK-NEXT: ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: umax z3.d, p0/m, z3.d, z4.d -; CHECK-NEXT: umax z2.d, p0/m, z2.d, z6.d -; CHECK-NEXT: umax z7.d, p0/m, z7.d, z25.d -; CHECK-NEXT: umax z24.d, p0/m, z24.d, z26.d -; CHECK-NEXT: umax z27.d, p0/m, z27.d, z28.d -; CHECK-NEXT: umax z29.d, p0/m, z29.d, z30.d -; CHECK-NEXT: umax z5.d, p0/m, z5.d, z31.d -; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d -; CHECK-NEXT: umax z2.d, p0/m, z2.d, z3.d -; CHECK-NEXT: umax z7.d, p0/m, z7.d, z24.d -; CHECK-NEXT: umax z27.d, p0/m, z27.d, z29.d -; CHECK-NEXT: umax z0.d, p0/m, z0.d, z5.d -; CHECK-NEXT: umax z2.d, p0/m, z2.d, z7.d -; CHECK-NEXT: umax z0.d, p0/m, z0.d, z27.d -; CHECK-NEXT: umax z0.d, p0/m, z0.d, z2.d -; CHECK-NEXT: umaxv d0, p0, z0.d -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: sub x8, x9, x8 -; CHECK-NEXT: rdvl x9, #1 -; CHECK-NEXT: whilelo p0.b, xzr, x8 -; CHECK-NEXT: whilelo p1.b, x9, x8 -; CHECK-NEXT: addvl sp, sp, #9 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ptrue p2.b +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov x9, x8 +; CHECK-NEXT: brkb p0.b, p2/z, p0.b +; CHECK-NEXT: brkb p1.b, p2/z, p1.b +; CHECK-NEXT: cntp x10, p0, p0.b +; CHECK-NEXT: incp x9, p1.b +; CHECK-NEXT: cmp x10, x8 +; CHECK-NEXT: csel x9, x10, x9, ne +; CHECK-NEXT: whilelo p0.b, xzr, x9 +; CHECK-NEXT: whilelo p1.b, x8, x9 ; CHECK-NEXT: ret %tz.elts = call i64 
@llvm.experimental.cttz.elts.i64.nxv32i1( %mask.in, i1 false) %mask.out = call @llvm.get.active.lane.mask.nxv32i1.i64(i64 0, i64 %tz.elts) @@ -392,16 +246,13 @@ define <32 x i1> @mask_exclude_active_v32(<32 x i1> %mask.in) { ; CHECK-NEXT: ldr w9, [sp, #64] ; CHECK-NEXT: fmov s0, w0 ; CHECK-NEXT: ldr w10, [sp, #72] -; CHECK-NEXT: index z2.b, #0, #-1 +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: ldr w9, [sp, #80] ; CHECK-NEXT: mov v0.b[1], w1 ; CHECK-NEXT: mov v1.b[1], w10 ; CHECK-NEXT: ldr w10, [sp, #128] -; CHECK-NEXT: mov z3.d, z2.d -; CHECK-NEXT: add z2.b, z2.b, #32 // =0x20 ; CHECK-NEXT: mov v0.b[2], w2 -; CHECK-NEXT: add z3.b, z3.b, #16 // =0x10 ; CHECK-NEXT: mov v1.b[2], w9 ; CHECK-NEXT: ldr w9, [sp, #88] ; CHECK-NEXT: mov v0.b[3], w3 @@ -448,72 +299,71 @@ define <32 x i1> @mask_exclude_active_v32(<32 x i1> %mask.in) { ; CHECK-NEXT: mov v1.b[14], w10 ; CHECK-NEXT: ldr w10, [sp, #184] ; CHECK-NEXT: mov v0.b[15], w9 -; CHECK-NEXT: mov w9, #32 // =0x20 ; CHECK-NEXT: mov v1.b[15], w10 ; CHECK-NEXT: shl v0.16b, v0.16b, #7 ; CHECK-NEXT: shl v1.16b, v1.16b, #7 -; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 -; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 -; CHECK-NEXT: and v2.16b, v0.16b, v2.16b +; CHECK-NEXT: cmpne p1.b, p0/z, z0.b, #0 ; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-NEXT: cmpne p2.b, p0/z, z1.b, #0 +; CHECK-NEXT: brkb p1.b, p0/z, p1.b +; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: mov z6.d, z0.d ; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: umax v2.16b, v2.16b, v1.16b -; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: brkb p0.b, p0/z, p2.b ; CHECK-NEXT: mov z5.d, z0.d ; CHECK-NEXT: mov z7.d, z0.d +; CHECK-NEXT: cntp x9, p1, p1.b +; CHECK-NEXT: mov z16.d, z0.d ; CHECK-NEXT: mov z17.d, z0.d +; CHECK-NEXT: cntp x10, p0, p0.b ; CHECK-NEXT: mov z18.d, z0.d ; CHECK-NEXT: mov z19.d, z0.d ; CHECK-NEXT: mov z20.d, z0.d ; CHECK-NEXT: mov z21.d, z0.d -; CHECK-NEXT: 
umaxv b16, v2.16b -; CHECK-NEXT: mov z2.d, z0.d ; CHECK-NEXT: mov z22.d, z0.d -; CHECK-NEXT: mov z23.d, z0.d ; CHECK-NEXT: add z1.d, z1.d, #14 // =0xe ; CHECK-NEXT: add z3.d, z3.d, #12 // =0xc ; CHECK-NEXT: add z6.d, z6.d, #10 // =0xa +; CHECK-NEXT: cmp x9, #16 ; CHECK-NEXT: add z4.d, z4.d, #8 // =0x8 -; CHECK-NEXT: add z5.d, z5.d, #4 // =0x4 ; CHECK-NEXT: add z2.d, z2.d, #6 // =0x6 +; CHECK-NEXT: add x10, x10, #16 +; CHECK-NEXT: add z5.d, z5.d, #4 // =0x4 ; CHECK-NEXT: add z7.d, z7.d, #2 // =0x2 -; CHECK-NEXT: add z17.d, z17.d, #30 // =0x1e -; CHECK-NEXT: fmov w10, s16 -; CHECK-NEXT: add z18.d, z18.d, #28 // =0x1c -; CHECK-NEXT: add z19.d, z19.d, #26 // =0x1a -; CHECK-NEXT: add z20.d, z20.d, #24 // =0x18 -; CHECK-NEXT: add z21.d, z21.d, #22 // =0x16 -; CHECK-NEXT: add z22.d, z22.d, #20 // =0x14 -; CHECK-NEXT: add z23.d, z23.d, #18 // =0x12 -; CHECK-NEXT: sub w9, w9, w10 -; CHECK-NEXT: and x9, x9, #0xff -; CHECK-NEXT: dup v16.2d, x9 +; CHECK-NEXT: csel x9, x9, x10, ne +; CHECK-NEXT: add z16.d, z16.d, #30 // =0x1e +; CHECK-NEXT: add z17.d, z17.d, #28 // =0x1c +; CHECK-NEXT: dup v23.2d, x9 +; CHECK-NEXT: add z18.d, z18.d, #26 // =0x1a +; CHECK-NEXT: add z19.d, z19.d, #24 // =0x18 +; CHECK-NEXT: add z20.d, z20.d, #22 // =0x16 +; CHECK-NEXT: add z21.d, z21.d, #20 // =0x14 +; CHECK-NEXT: add z22.d, z22.d, #18 // =0x12 ; CHECK-NEXT: adrp x9, .LCPI17_0 -; CHECK-NEXT: cmhi v24.2d, v16.2d, v0.2d +; CHECK-NEXT: cmhi v24.2d, v23.2d, v0.2d ; CHECK-NEXT: add z0.d, z0.d, #16 // =0x10 -; CHECK-NEXT: cmhi v1.2d, v16.2d, v1.2d -; CHECK-NEXT: cmhi v3.2d, v16.2d, v3.2d -; CHECK-NEXT: cmhi v6.2d, v16.2d, v6.2d -; CHECK-NEXT: cmhi v4.2d, v16.2d, v4.2d -; CHECK-NEXT: cmhi v17.2d, v16.2d, v17.2d -; CHECK-NEXT: cmhi v18.2d, v16.2d, v18.2d -; CHECK-NEXT: cmhi v19.2d, v16.2d, v19.2d -; CHECK-NEXT: cmhi v20.2d, v16.2d, v20.2d -; CHECK-NEXT: cmhi v21.2d, v16.2d, v21.2d -; CHECK-NEXT: cmhi v22.2d, v16.2d, v22.2d -; CHECK-NEXT: cmhi v23.2d, v16.2d, v23.2d -; CHECK-NEXT: cmhi v0.2d, v16.2d, 
v0.2d -; CHECK-NEXT: cmhi v2.2d, v16.2d, v2.2d -; CHECK-NEXT: cmhi v5.2d, v16.2d, v5.2d -; CHECK-NEXT: cmhi v7.2d, v16.2d, v7.2d +; CHECK-NEXT: cmhi v1.2d, v23.2d, v1.2d +; CHECK-NEXT: cmhi v3.2d, v23.2d, v3.2d +; CHECK-NEXT: cmhi v6.2d, v23.2d, v6.2d +; CHECK-NEXT: cmhi v4.2d, v23.2d, v4.2d +; CHECK-NEXT: cmhi v16.2d, v23.2d, v16.2d +; CHECK-NEXT: cmhi v17.2d, v23.2d, v17.2d +; CHECK-NEXT: cmhi v18.2d, v23.2d, v18.2d +; CHECK-NEXT: cmhi v19.2d, v23.2d, v19.2d +; CHECK-NEXT: cmhi v20.2d, v23.2d, v20.2d +; CHECK-NEXT: cmhi v21.2d, v23.2d, v21.2d +; CHECK-NEXT: cmhi v22.2d, v23.2d, v22.2d +; CHECK-NEXT: cmhi v0.2d, v23.2d, v0.2d +; CHECK-NEXT: cmhi v2.2d, v23.2d, v2.2d +; CHECK-NEXT: cmhi v5.2d, v23.2d, v5.2d +; CHECK-NEXT: cmhi v7.2d, v23.2d, v7.2d ; CHECK-NEXT: uzp1 v1.4s, v3.4s, v1.4s -; CHECK-NEXT: uzp1 v3.4s, v18.4s, v17.4s -; CHECK-NEXT: uzp1 v16.4s, v20.4s, v19.4s -; CHECK-NEXT: uzp1 v17.4s, v22.4s, v21.4s -; CHECK-NEXT: uzp1 v0.4s, v0.4s, v23.4s +; CHECK-NEXT: uzp1 v3.4s, v17.4s, v16.4s +; CHECK-NEXT: uzp1 v16.4s, v19.4s, v18.4s +; CHECK-NEXT: uzp1 v17.4s, v21.4s, v20.4s +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v22.4s ; CHECK-NEXT: uzp1 v4.4s, v4.4s, v6.4s ; CHECK-NEXT: uzp1 v2.4s, v5.4s, v2.4s ; CHECK-NEXT: uzp1 v5.4s, v24.4s, v7.4s diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-elts.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-elts.ll index cdaed030e274..fc892c1a5cae 100644 --- a/llvm/test/CodeGen/RISCV/rvv/cttz-elts.ll +++ b/llvm/test/CodeGen/RISCV/rvv/cttz-elts.ll @@ -7,44 +7,26 @@ define i32 @ctz_nxv4i32( %a) #0 { ; RV32-LABEL: ctz_nxv4i32: ; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; RV32-NEXT: vmsne.vi v10, v8, 0 +; RV32-NEXT: vfirst.m a0, v10 +; RV32-NEXT: bgez a0, .LBB0_2 +; RV32-NEXT: # %bb.1: ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; RV32-NEXT: vid.v v10 -; RV32-NEXT: li a1, -1 -; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV32-NEXT: vmsne.vi v0, v8, 0 ; RV32-NEXT: srli a0, a0, 1 -; RV32-NEXT: 
vsetvli zero, zero, e16, m1, ta, ma -; RV32-NEXT: vmv.v.x v8, a0 -; RV32-NEXT: vmadd.vx v10, a1, v8 -; RV32-NEXT: vmv.v.i v8, 0 -; RV32-NEXT: vmerge.vvm v8, v8, v10, v0 -; RV32-NEXT: vredmaxu.vs v8, v8, v8 -; RV32-NEXT: vmv.x.s a1, v8 -; RV32-NEXT: sub a0, a0, a1 -; RV32-NEXT: slli a0, a0, 16 -; RV32-NEXT: srli a0, a0, 16 +; RV32-NEXT: .LBB0_2: ; RV32-NEXT: ret ; ; RV64-LABEL: ctz_nxv4i32: ; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; RV64-NEXT: vmsne.vi v10, v8, 0 +; RV64-NEXT: vfirst.m a0, v10 +; RV64-NEXT: bgez a0, .LBB0_2 +; RV64-NEXT: # %bb.1: ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, ma -; RV64-NEXT: vid.v v10 -; RV64-NEXT: li a1, -1 -; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma -; RV64-NEXT: vmsne.vi v0, v8, 0 ; RV64-NEXT: srli a0, a0, 1 -; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, ma -; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: vmadd.vx v10, a1, v8 -; RV64-NEXT: vmv.v.i v8, 0 -; RV64-NEXT: vmerge.vvm v8, v8, v10, v0 -; RV64-NEXT: vredmaxu.vs v8, v8, v8 -; RV64-NEXT: vmv.x.s a1, v8 -; RV64-NEXT: sub a0, a0, a1 -; RV64-NEXT: slli a0, a0, 48 -; RV64-NEXT: srli a0, a0, 48 +; RV64-NEXT: .LBB0_2: ; RV64-NEXT: ret %res = call i32 @llvm.experimental.cttz.elts.i32.nxv4i32( %a, i1 0) ret i32 %res @@ -55,74 +37,25 @@ define i32 @ctz_nxv4i32( %a) #0 { define i64 @ctz_nxv8i1_no_range( %a) { ; RV32-LABEL: ctz_nxv8i1_no_range: ; RV32: # %bb.0: -; RV32-NEXT: addi sp, sp, -48 -; RV32-NEXT: .cfi_def_cfa_offset 48 -; RV32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill -; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; RV32-NEXT: vmsne.vi v10, v8, 0 +; RV32-NEXT: vfirst.m a0, v10 +; RV32-NEXT: bgez a0, .LBB1_2 +; RV32-NEXT: # %bb.1: ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: slli a0, a0, 1 -; RV32-NEXT: sub sp, sp, a0 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 2 * vlenb -; RV32-NEXT: addi a0, sp, 32 -; 
RV32-NEXT: vs2r.v v8, (a0) # vscale x 16-byte Folded Spill -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: srli a0, a0, 3 -; RV32-NEXT: li a2, 8 +; RV32-NEXT: .LBB1_2: ; RV32-NEXT: li a1, 0 -; RV32-NEXT: li a3, 0 -; RV32-NEXT: call __muldi3 -; RV32-NEXT: sw a0, 16(sp) -; RV32-NEXT: sw a1, 20(sp) -; RV32-NEXT: addi a2, sp, 16 -; RV32-NEXT: vsetvli a3, zero, e64, m8, ta, ma -; RV32-NEXT: vlse64.v v16, (a2), zero -; RV32-NEXT: vid.v v8 -; RV32-NEXT: li a2, -1 -; RV32-NEXT: addi a3, sp, 32 -; RV32-NEXT: vl2r.v v24, (a3) # vscale x 16-byte Folded Reload -; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vmsne.vi v0, v24, 0 -; RV32-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV32-NEXT: vmadd.vx v8, a2, v16 -; RV32-NEXT: vmv.v.i v16, 0 -; RV32-NEXT: li a2, 32 -; RV32-NEXT: vmerge.vim v16, v16, -1, v0 -; RV32-NEXT: vand.vv v8, v8, v16 -; RV32-NEXT: vredmaxu.vs v8, v8, v8 -; RV32-NEXT: vmv.x.s a3, v8 -; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV32-NEXT: vsrl.vx v8, v8, a2 -; RV32-NEXT: sltu a2, a0, a3 -; RV32-NEXT: vmv.x.s a4, v8 -; RV32-NEXT: sub a1, a1, a4 -; RV32-NEXT: sub a1, a1, a2 -; RV32-NEXT: sub a0, a0, a3 -; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 1 -; RV32-NEXT: add sp, sp, a2 -; RV32-NEXT: .cfi_def_cfa sp, 48 -; RV32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload -; RV32-NEXT: .cfi_restore ra -; RV32-NEXT: addi sp, sp, 48 -; RV32-NEXT: .cfi_def_cfa_offset 0 ; RV32-NEXT: ret ; ; RV64-LABEL: ctz_nxv8i1_no_range: ; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; RV64-NEXT: vmsne.vi v10, v8, 0 +; RV64-NEXT: vfirst.m a0, v10 +; RV64-NEXT: bgez a0, .LBB1_2 +; RV64-NEXT: # %bb.1: ; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma -; RV64-NEXT: vid.v v16 -; RV64-NEXT: li a1, -1 -; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV64-NEXT: vmsne.vi v0, v8, 0 -; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma -; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: vmadd.vx v16, a1, v8 -; RV64-NEXT: vmv.v.i v8, 0 -; 
RV64-NEXT: vmerge.vvm v8, v8, v16, v0 -; RV64-NEXT: vredmaxu.vs v8, v8, v8 -; RV64-NEXT: vmv.x.s a1, v8 -; RV64-NEXT: sub a0, a0, a1 +; RV64-NEXT: .LBB1_2: ; RV64-NEXT: ret %res = call i64 @llvm.experimental.cttz.elts.i64.nxv8i16( %a, i1 0) ret i64 %res diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-elts.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-elts.ll index 632c9a5a7591..cb91e5dfe73c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-elts.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz-elts.ll @@ -8,40 +8,428 @@ define i16 @ctz_v4i32(<4 x i32> %a) { ; RV32-LABEL: ctz_v4i32: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vmsne.vi v0, v8, 0 -; RV32-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; RV32-NEXT: vmv.v.i v8, 0 -; RV32-NEXT: vmerge.vim v8, v8, -1, v0 -; RV32-NEXT: vid.v v9 -; RV32-NEXT: vrsub.vi v9, v9, 4 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vredmaxu.vs v8, v8, v8 -; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 4 -; RV32-NEXT: sub a1, a1, a0 -; RV32-NEXT: zext.b a0, a1 +; RV32-NEXT: vmsne.vi v8, v8, 0 +; RV32-NEXT: vfirst.m a0, v8 +; RV32-NEXT: bgez a0, .LBB0_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a0, 4 +; RV32-NEXT: .LBB0_2: ; RV32-NEXT: ret ; ; RV64-LABEL: ctz_v4i32: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vmsne.vi v0, v8, 0 -; RV64-NEXT: vsetvli zero, zero, e8, mf4, ta, ma -; RV64-NEXT: vmv.v.i v8, 0 -; RV64-NEXT: vmerge.vim v8, v8, -1, v0 -; RV64-NEXT: vid.v v9 -; RV64-NEXT: vrsub.vi v9, v9, 4 -; RV64-NEXT: vand.vv v8, v8, v9 -; RV64-NEXT: vredmaxu.vs v8, v8, v8 -; RV64-NEXT: vmv.x.s a0, v8 -; RV64-NEXT: li a1, 4 -; RV64-NEXT: sub a1, a1, a0 -; RV64-NEXT: zext.b a0, a1 +; RV64-NEXT: vmsne.vi v8, v8, 0 +; RV64-NEXT: vfirst.m a0, v8 +; RV64-NEXT: bgez a0, .LBB0_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a0, 4 +; RV64-NEXT: .LBB0_2: ; RV64-NEXT: ret %res = call i16 @llvm.experimental.cttz.elts.i16.v4i32(<4 x i32> %a, i1 0) ret i16 %res } 
+define i16 @ctz_v2048i1(<2048 x i1> %a) { +; RV32-LABEL: ctz_v2048i1: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 128 +; RV32-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; RV32-NEXT: vfirst.m a0, v0 +; RV32-NEXT: bltz a0, .LBB1_32 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: vfirst.m a2, v8 +; RV32-NEXT: bltz a2, .LBB1_33 +; RV32-NEXT: .LBB1_2: +; RV32-NEXT: beq a0, a1, .LBB1_34 +; RV32-NEXT: .LBB1_3: +; RV32-NEXT: vfirst.m a3, v9 +; RV32-NEXT: bltz a3, .LBB1_35 +; RV32-NEXT: .LBB1_4: +; RV32-NEXT: vfirst.m a2, v10 +; RV32-NEXT: bltz a2, .LBB1_36 +; RV32-NEXT: .LBB1_5: +; RV32-NEXT: beq a3, a1, .LBB1_37 +; RV32-NEXT: .LBB1_6: +; RV32-NEXT: li a2, 256 +; RV32-NEXT: beq a0, a2, .LBB1_38 +; RV32-NEXT: .LBB1_7: +; RV32-NEXT: vfirst.m a4, v11 +; RV32-NEXT: bltz a4, .LBB1_39 +; RV32-NEXT: .LBB1_8: +; RV32-NEXT: vfirst.m a3, v12 +; RV32-NEXT: bltz a3, .LBB1_40 +; RV32-NEXT: .LBB1_9: +; RV32-NEXT: beq a4, a1, .LBB1_41 +; RV32-NEXT: .LBB1_10: +; RV32-NEXT: vfirst.m a3, v13 +; RV32-NEXT: bltz a3, .LBB1_42 +; RV32-NEXT: .LBB1_11: +; RV32-NEXT: vfirst.m a5, v14 +; RV32-NEXT: bltz a5, .LBB1_43 +; RV32-NEXT: .LBB1_12: +; RV32-NEXT: beq a3, a1, .LBB1_44 +; RV32-NEXT: .LBB1_13: +; RV32-NEXT: beq a4, a2, .LBB1_45 +; RV32-NEXT: .LBB1_14: +; RV32-NEXT: li a3, 512 +; RV32-NEXT: beq a0, a3, .LBB1_46 +; RV32-NEXT: .LBB1_15: +; RV32-NEXT: vfirst.m a4, v15 +; RV32-NEXT: bltz a4, .LBB1_47 +; RV32-NEXT: .LBB1_16: +; RV32-NEXT: vfirst.m a5, v16 +; RV32-NEXT: bltz a5, .LBB1_48 +; RV32-NEXT: .LBB1_17: +; RV32-NEXT: beq a4, a1, .LBB1_49 +; RV32-NEXT: .LBB1_18: +; RV32-NEXT: vfirst.m a5, v17 +; RV32-NEXT: bltz a5, .LBB1_50 +; RV32-NEXT: .LBB1_19: +; RV32-NEXT: vfirst.m a6, v18 +; RV32-NEXT: bltz a6, .LBB1_51 +; RV32-NEXT: .LBB1_20: +; RV32-NEXT: beq a5, a1, .LBB1_52 +; RV32-NEXT: .LBB1_21: +; RV32-NEXT: beq a4, a2, .LBB1_53 +; RV32-NEXT: .LBB1_22: +; RV32-NEXT: vfirst.m a5, v19 +; RV32-NEXT: bltz a5, .LBB1_54 +; RV32-NEXT: .LBB1_23: +; RV32-NEXT: vfirst.m a6, v20 +; RV32-NEXT: bltz a6, .LBB1_55 +; RV32-NEXT: 
.LBB1_24: +; RV32-NEXT: beq a5, a1, .LBB1_56 +; RV32-NEXT: .LBB1_25: +; RV32-NEXT: vfirst.m a6, v21 +; RV32-NEXT: bltz a6, .LBB1_57 +; RV32-NEXT: .LBB1_26: +; RV32-NEXT: vfirst.m a7, v22 +; RV32-NEXT: bltz a7, .LBB1_58 +; RV32-NEXT: .LBB1_27: +; RV32-NEXT: beq a6, a1, .LBB1_59 +; RV32-NEXT: .LBB1_28: +; RV32-NEXT: beq a5, a2, .LBB1_60 +; RV32-NEXT: .LBB1_29: +; RV32-NEXT: beq a4, a3, .LBB1_61 +; RV32-NEXT: .LBB1_30: +; RV32-NEXT: li a1, 1024 +; RV32-NEXT: beq a0, a1, .LBB1_62 +; RV32-NEXT: .LBB1_31: +; RV32-NEXT: ret +; RV32-NEXT: .LBB1_32: +; RV32-NEXT: li a0, 128 +; RV32-NEXT: vfirst.m a2, v8 +; RV32-NEXT: bgez a2, .LBB1_2 +; RV32-NEXT: .LBB1_33: +; RV32-NEXT: li a2, 128 +; RV32-NEXT: bne a0, a1, .LBB1_3 +; RV32-NEXT: .LBB1_34: +; RV32-NEXT: addi a0, a2, 128 +; RV32-NEXT: vfirst.m a3, v9 +; RV32-NEXT: bgez a3, .LBB1_4 +; RV32-NEXT: .LBB1_35: +; RV32-NEXT: li a3, 128 +; RV32-NEXT: vfirst.m a2, v10 +; RV32-NEXT: bgez a2, .LBB1_5 +; RV32-NEXT: .LBB1_36: +; RV32-NEXT: li a2, 128 +; RV32-NEXT: bne a3, a1, .LBB1_6 +; RV32-NEXT: .LBB1_37: +; RV32-NEXT: addi a3, a2, 128 +; RV32-NEXT: li a2, 256 +; RV32-NEXT: bne a0, a2, .LBB1_7 +; RV32-NEXT: .LBB1_38: +; RV32-NEXT: addi a0, a3, 256 +; RV32-NEXT: vfirst.m a4, v11 +; RV32-NEXT: bgez a4, .LBB1_8 +; RV32-NEXT: .LBB1_39: +; RV32-NEXT: li a4, 128 +; RV32-NEXT: vfirst.m a3, v12 +; RV32-NEXT: bgez a3, .LBB1_9 +; RV32-NEXT: .LBB1_40: +; RV32-NEXT: li a3, 128 +; RV32-NEXT: bne a4, a1, .LBB1_10 +; RV32-NEXT: .LBB1_41: +; RV32-NEXT: addi a4, a3, 128 +; RV32-NEXT: vfirst.m a3, v13 +; RV32-NEXT: bgez a3, .LBB1_11 +; RV32-NEXT: .LBB1_42: +; RV32-NEXT: li a3, 128 +; RV32-NEXT: vfirst.m a5, v14 +; RV32-NEXT: bgez a5, .LBB1_12 +; RV32-NEXT: .LBB1_43: +; RV32-NEXT: li a5, 128 +; RV32-NEXT: bne a3, a1, .LBB1_13 +; RV32-NEXT: .LBB1_44: +; RV32-NEXT: addi a3, a5, 128 +; RV32-NEXT: bne a4, a2, .LBB1_14 +; RV32-NEXT: .LBB1_45: +; RV32-NEXT: addi a4, a3, 256 +; RV32-NEXT: li a3, 512 +; RV32-NEXT: bne a0, a3, .LBB1_15 +; RV32-NEXT: .LBB1_46: +; 
RV32-NEXT: addi a0, a4, 512 +; RV32-NEXT: vfirst.m a4, v15 +; RV32-NEXT: bgez a4, .LBB1_16 +; RV32-NEXT: .LBB1_47: +; RV32-NEXT: li a4, 128 +; RV32-NEXT: vfirst.m a5, v16 +; RV32-NEXT: bgez a5, .LBB1_17 +; RV32-NEXT: .LBB1_48: +; RV32-NEXT: li a5, 128 +; RV32-NEXT: bne a4, a1, .LBB1_18 +; RV32-NEXT: .LBB1_49: +; RV32-NEXT: addi a4, a5, 128 +; RV32-NEXT: vfirst.m a5, v17 +; RV32-NEXT: bgez a5, .LBB1_19 +; RV32-NEXT: .LBB1_50: +; RV32-NEXT: li a5, 128 +; RV32-NEXT: vfirst.m a6, v18 +; RV32-NEXT: bgez a6, .LBB1_20 +; RV32-NEXT: .LBB1_51: +; RV32-NEXT: li a6, 128 +; RV32-NEXT: bne a5, a1, .LBB1_21 +; RV32-NEXT: .LBB1_52: +; RV32-NEXT: addi a5, a6, 128 +; RV32-NEXT: bne a4, a2, .LBB1_22 +; RV32-NEXT: .LBB1_53: +; RV32-NEXT: addi a4, a5, 256 +; RV32-NEXT: vfirst.m a5, v19 +; RV32-NEXT: bgez a5, .LBB1_23 +; RV32-NEXT: .LBB1_54: +; RV32-NEXT: li a5, 128 +; RV32-NEXT: vfirst.m a6, v20 +; RV32-NEXT: bgez a6, .LBB1_24 +; RV32-NEXT: .LBB1_55: +; RV32-NEXT: li a6, 128 +; RV32-NEXT: bne a5, a1, .LBB1_25 +; RV32-NEXT: .LBB1_56: +; RV32-NEXT: addi a5, a6, 128 +; RV32-NEXT: vfirst.m a6, v21 +; RV32-NEXT: bgez a6, .LBB1_26 +; RV32-NEXT: .LBB1_57: +; RV32-NEXT: li a6, 128 +; RV32-NEXT: vfirst.m a7, v22 +; RV32-NEXT: bgez a7, .LBB1_27 +; RV32-NEXT: .LBB1_58: +; RV32-NEXT: li a7, 128 +; RV32-NEXT: bne a6, a1, .LBB1_28 +; RV32-NEXT: .LBB1_59: +; RV32-NEXT: addi a6, a7, 128 +; RV32-NEXT: bne a5, a2, .LBB1_29 +; RV32-NEXT: .LBB1_60: +; RV32-NEXT: addi a5, a6, 256 +; RV32-NEXT: bne a4, a3, .LBB1_30 +; RV32-NEXT: .LBB1_61: +; RV32-NEXT: addi a4, a5, 512 +; RV32-NEXT: li a1, 1024 +; RV32-NEXT: bne a0, a1, .LBB1_31 +; RV32-NEXT: .LBB1_62: +; RV32-NEXT: addi a0, a4, 1024 +; RV32-NEXT: ret +; +; RV64-LABEL: ctz_v2048i1: +; RV64: # %bb.0: +; RV64-NEXT: li a1, 128 +; RV64-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; RV64-NEXT: vfirst.m a0, v0 +; RV64-NEXT: bltz a0, .LBB1_32 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: vfirst.m a2, v8 +; RV64-NEXT: bltz a2, .LBB1_33 +; RV64-NEXT: .LBB1_2: +; RV64-NEXT: beq 
a0, a1, .LBB1_34 +; RV64-NEXT: .LBB1_3: +; RV64-NEXT: vfirst.m a3, v9 +; RV64-NEXT: bltz a3, .LBB1_35 +; RV64-NEXT: .LBB1_4: +; RV64-NEXT: vfirst.m a2, v10 +; RV64-NEXT: bltz a2, .LBB1_36 +; RV64-NEXT: .LBB1_5: +; RV64-NEXT: beq a3, a1, .LBB1_37 +; RV64-NEXT: .LBB1_6: +; RV64-NEXT: li a2, 256 +; RV64-NEXT: beq a0, a2, .LBB1_38 +; RV64-NEXT: .LBB1_7: +; RV64-NEXT: vfirst.m a4, v11 +; RV64-NEXT: bltz a4, .LBB1_39 +; RV64-NEXT: .LBB1_8: +; RV64-NEXT: vfirst.m a3, v12 +; RV64-NEXT: bltz a3, .LBB1_40 +; RV64-NEXT: .LBB1_9: +; RV64-NEXT: beq a4, a1, .LBB1_41 +; RV64-NEXT: .LBB1_10: +; RV64-NEXT: vfirst.m a3, v13 +; RV64-NEXT: bltz a3, .LBB1_42 +; RV64-NEXT: .LBB1_11: +; RV64-NEXT: vfirst.m a5, v14 +; RV64-NEXT: bltz a5, .LBB1_43 +; RV64-NEXT: .LBB1_12: +; RV64-NEXT: beq a3, a1, .LBB1_44 +; RV64-NEXT: .LBB1_13: +; RV64-NEXT: beq a4, a2, .LBB1_45 +; RV64-NEXT: .LBB1_14: +; RV64-NEXT: li a3, 512 +; RV64-NEXT: beq a0, a3, .LBB1_46 +; RV64-NEXT: .LBB1_15: +; RV64-NEXT: vfirst.m a4, v15 +; RV64-NEXT: bltz a4, .LBB1_47 +; RV64-NEXT: .LBB1_16: +; RV64-NEXT: vfirst.m a5, v16 +; RV64-NEXT: bltz a5, .LBB1_48 +; RV64-NEXT: .LBB1_17: +; RV64-NEXT: beq a4, a1, .LBB1_49 +; RV64-NEXT: .LBB1_18: +; RV64-NEXT: vfirst.m a5, v17 +; RV64-NEXT: bltz a5, .LBB1_50 +; RV64-NEXT: .LBB1_19: +; RV64-NEXT: vfirst.m a6, v18 +; RV64-NEXT: bltz a6, .LBB1_51 +; RV64-NEXT: .LBB1_20: +; RV64-NEXT: beq a5, a1, .LBB1_52 +; RV64-NEXT: .LBB1_21: +; RV64-NEXT: beq a4, a2, .LBB1_53 +; RV64-NEXT: .LBB1_22: +; RV64-NEXT: vfirst.m a5, v19 +; RV64-NEXT: bltz a5, .LBB1_54 +; RV64-NEXT: .LBB1_23: +; RV64-NEXT: vfirst.m a6, v20 +; RV64-NEXT: bltz a6, .LBB1_55 +; RV64-NEXT: .LBB1_24: +; RV64-NEXT: beq a5, a1, .LBB1_56 +; RV64-NEXT: .LBB1_25: +; RV64-NEXT: vfirst.m a6, v21 +; RV64-NEXT: bltz a6, .LBB1_57 +; RV64-NEXT: .LBB1_26: +; RV64-NEXT: vfirst.m a7, v22 +; RV64-NEXT: bltz a7, .LBB1_58 +; RV64-NEXT: .LBB1_27: +; RV64-NEXT: beq a6, a1, .LBB1_59 +; RV64-NEXT: .LBB1_28: +; RV64-NEXT: beq a5, a2, .LBB1_60 +; RV64-NEXT: 
.LBB1_29: +; RV64-NEXT: beq a4, a3, .LBB1_61 +; RV64-NEXT: .LBB1_30: +; RV64-NEXT: li a1, 1024 +; RV64-NEXT: beq a0, a1, .LBB1_62 +; RV64-NEXT: .LBB1_31: +; RV64-NEXT: ret +; RV64-NEXT: .LBB1_32: +; RV64-NEXT: li a0, 128 +; RV64-NEXT: vfirst.m a2, v8 +; RV64-NEXT: bgez a2, .LBB1_2 +; RV64-NEXT: .LBB1_33: +; RV64-NEXT: li a2, 128 +; RV64-NEXT: bne a0, a1, .LBB1_3 +; RV64-NEXT: .LBB1_34: +; RV64-NEXT: addi a0, a2, 128 +; RV64-NEXT: vfirst.m a3, v9 +; RV64-NEXT: bgez a3, .LBB1_4 +; RV64-NEXT: .LBB1_35: +; RV64-NEXT: li a3, 128 +; RV64-NEXT: vfirst.m a2, v10 +; RV64-NEXT: bgez a2, .LBB1_5 +; RV64-NEXT: .LBB1_36: +; RV64-NEXT: li a2, 128 +; RV64-NEXT: bne a3, a1, .LBB1_6 +; RV64-NEXT: .LBB1_37: +; RV64-NEXT: addi a3, a2, 128 +; RV64-NEXT: li a2, 256 +; RV64-NEXT: bne a0, a2, .LBB1_7 +; RV64-NEXT: .LBB1_38: +; RV64-NEXT: addi a0, a3, 256 +; RV64-NEXT: vfirst.m a4, v11 +; RV64-NEXT: bgez a4, .LBB1_8 +; RV64-NEXT: .LBB1_39: +; RV64-NEXT: li a4, 128 +; RV64-NEXT: vfirst.m a3, v12 +; RV64-NEXT: bgez a3, .LBB1_9 +; RV64-NEXT: .LBB1_40: +; RV64-NEXT: li a3, 128 +; RV64-NEXT: bne a4, a1, .LBB1_10 +; RV64-NEXT: .LBB1_41: +; RV64-NEXT: addi a4, a3, 128 +; RV64-NEXT: vfirst.m a3, v13 +; RV64-NEXT: bgez a3, .LBB1_11 +; RV64-NEXT: .LBB1_42: +; RV64-NEXT: li a3, 128 +; RV64-NEXT: vfirst.m a5, v14 +; RV64-NEXT: bgez a5, .LBB1_12 +; RV64-NEXT: .LBB1_43: +; RV64-NEXT: li a5, 128 +; RV64-NEXT: bne a3, a1, .LBB1_13 +; RV64-NEXT: .LBB1_44: +; RV64-NEXT: addi a3, a5, 128 +; RV64-NEXT: bne a4, a2, .LBB1_14 +; RV64-NEXT: .LBB1_45: +; RV64-NEXT: addi a4, a3, 256 +; RV64-NEXT: li a3, 512 +; RV64-NEXT: bne a0, a3, .LBB1_15 +; RV64-NEXT: .LBB1_46: +; RV64-NEXT: addi a0, a4, 512 +; RV64-NEXT: vfirst.m a4, v15 +; RV64-NEXT: bgez a4, .LBB1_16 +; RV64-NEXT: .LBB1_47: +; RV64-NEXT: li a4, 128 +; RV64-NEXT: vfirst.m a5, v16 +; RV64-NEXT: bgez a5, .LBB1_17 +; RV64-NEXT: .LBB1_48: +; RV64-NEXT: li a5, 128 +; RV64-NEXT: bne a4, a1, .LBB1_18 +; RV64-NEXT: .LBB1_49: +; RV64-NEXT: addi a4, a5, 128 +; 
RV64-NEXT: vfirst.m a5, v17 +; RV64-NEXT: bgez a5, .LBB1_19 +; RV64-NEXT: .LBB1_50: +; RV64-NEXT: li a5, 128 +; RV64-NEXT: vfirst.m a6, v18 +; RV64-NEXT: bgez a6, .LBB1_20 +; RV64-NEXT: .LBB1_51: +; RV64-NEXT: li a6, 128 +; RV64-NEXT: bne a5, a1, .LBB1_21 +; RV64-NEXT: .LBB1_52: +; RV64-NEXT: addi a5, a6, 128 +; RV64-NEXT: bne a4, a2, .LBB1_22 +; RV64-NEXT: .LBB1_53: +; RV64-NEXT: addi a4, a5, 256 +; RV64-NEXT: vfirst.m a5, v19 +; RV64-NEXT: bgez a5, .LBB1_23 +; RV64-NEXT: .LBB1_54: +; RV64-NEXT: li a5, 128 +; RV64-NEXT: vfirst.m a6, v20 +; RV64-NEXT: bgez a6, .LBB1_24 +; RV64-NEXT: .LBB1_55: +; RV64-NEXT: li a6, 128 +; RV64-NEXT: bne a5, a1, .LBB1_25 +; RV64-NEXT: .LBB1_56: +; RV64-NEXT: addi a5, a6, 128 +; RV64-NEXT: vfirst.m a6, v21 +; RV64-NEXT: bgez a6, .LBB1_26 +; RV64-NEXT: .LBB1_57: +; RV64-NEXT: li a6, 128 +; RV64-NEXT: vfirst.m a7, v22 +; RV64-NEXT: bgez a7, .LBB1_27 +; RV64-NEXT: .LBB1_58: +; RV64-NEXT: li a7, 128 +; RV64-NEXT: bne a6, a1, .LBB1_28 +; RV64-NEXT: .LBB1_59: +; RV64-NEXT: addi a6, a7, 128 +; RV64-NEXT: bne a5, a2, .LBB1_29 +; RV64-NEXT: .LBB1_60: +; RV64-NEXT: addi a5, a6, 256 +; RV64-NEXT: bne a4, a3, .LBB1_30 +; RV64-NEXT: .LBB1_61: +; RV64-NEXT: addi a4, a5, 512 +; RV64-NEXT: li a1, 1024 +; RV64-NEXT: bne a0, a1, .LBB1_31 +; RV64-NEXT: .LBB1_62: +; RV64-NEXT: addi a0, a4, 1024 +; RV64-NEXT: ret + %res = call i16 @llvm.experimental.cttz.elts(<2048 x i1> %a, i1 0) + ret i16 %res +} + ; ZERO IS POISON define i32 @ctz_v2i1_poison(<2 x i1> %a) {