From 98d8b69dfc8ac946e86085dbacc2f7fa72ce9c38 Mon Sep 17 00:00:00 2001 From: Alexey Karyakin Date: Tue, 17 Feb 2026 14:56:05 -0500 Subject: [PATCH] [Hexagon] Support partial reduction intrinsics (#179797) This commit has changes necessary for using vrmpy instructions in full and partial multiply/add reductions on extended arguments. There are three main parts: - partial reduction operations PARTIAL_REDUCE_(U|S|SU)MLA are lowered to accumulating vrmpy, including native and multiples of native vector sizes; - full and partial reductions can be "split" into an inner partial reduction and a residual full or partial reduction. The inner reduction will be lowered to vrmpy due to the first change; - vecreduce_add expansion is moved to Hexagon backend from a generic pass, accompanied by a set of tests. In addition, there is a minor cleanup in HexagonTargetLowering::PerformDAGCombine(). --- .../Target/Hexagon/HexagonISelLowering.cpp | 142 ++++++-- llvm/lib/Target/Hexagon/HexagonISelLowering.h | 13 +- .../Target/Hexagon/HexagonISelLoweringHVX.cpp | 322 ++++++++++++++++-- llvm/lib/Target/Hexagon/HexagonPatternsHVX.td | 7 + .../Hexagon/HexagonTargetTransformInfo.cpp | 8 + .../Hexagon/HexagonTargetTransformInfo.h | 2 +- .../CodeGen/Hexagon/expand-vecreduce-add.ll | 143 ++++++++ llvm/test/CodeGen/Hexagon/hvx-full-reduce.ll | 145 ++++++++ .../CodeGen/Hexagon/hvx-partial-reduce.ll | 162 +++++++++ 9 files changed, 884 insertions(+), 60 deletions(-) create mode 100644 llvm/test/CodeGen/Hexagon/expand-vecreduce-add.ll create mode 100644 llvm/test/CodeGen/Hexagon/hvx-full-reduce.ll create mode 100644 llvm/test/CodeGen/Hexagon/hvx-partial-reduce.ll diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index a626de6302b9..4913e96cd3f0 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1506,6 +1506,8 @@ HexagonTargetLowering::HexagonTargetLowering(const 
TargetMachine &TM, MaxStoresPerMemset = 8; MaxStoresPerMemsetOptSize = 4; + setTargetDAGCombine(ISD::VECREDUCE_ADD); + // // Set up register classes. // @@ -3413,16 +3415,50 @@ HexagonTargetLowering::ReplaceNodeResults(SDNode *N, SDValue HexagonTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { + SDValue Op(N, 0); + const SDLoc &dl(Op); + unsigned Opc = Op.getOpcode(); + + // Combining transformations applicable for arbitrary vector sizes. + if (DCI.isBeforeLegalizeOps()) { + switch (Opc) { + case ISD::VECREDUCE_ADD: + if (SDValue V = splitVecReduceAdd(N, DCI.DAG)) + return V; + if (SDValue V = expandVecReduceAdd(N, DCI.DAG)) + return V; + return SDValue(); + case ISD::PARTIAL_REDUCE_SMLA: + case ISD::PARTIAL_REDUCE_UMLA: + case ISD::PARTIAL_REDUCE_SUMLA: + if (SDValue V = splitExtendingPartialReduceMLA(N, DCI.DAG)) + return V; + return SDValue(); + } + } else { + switch (Opc) { + case ISD::VSELECT: { + // (vselect (xor x, ptrue), v0, v1) -> (vselect x, v1, v0) + SDValue Cond = Op.getOperand(0); + if (Cond->getOpcode() == ISD::XOR) { + SDValue C0 = Cond.getOperand(0), C1 = Cond.getOperand(1); + if (C1->getOpcode() == HexagonISD::PTRUE) { + SDValue VSel = DCI.DAG.getNode(ISD::VSELECT, dl, ty(Op), C0, + Op.getOperand(2), Op.getOperand(1)); + return VSel; + } + } + return SDValue(); + } + } + } + if (isHvxOperation(N, DCI.DAG)) { if (SDValue V = PerformHvxDAGCombine(N, DCI)) return V; return SDValue(); } - SDValue Op(N, 0); - const SDLoc &dl(Op); - unsigned Opc = Op.getOpcode(); - if (Opc == ISD::TRUNCATE) { SDValue Op0 = Op.getOperand(0); // fold (truncate (build pair x, y)) -> (truncate x) or x @@ -3441,7 +3477,8 @@ HexagonTargetLowering::PerformDAGCombine(SDNode *N, if (DCI.isBeforeLegalizeOps()) return SDValue(); - if (Opc == HexagonISD::P2D) { + switch (Opc) { + case HexagonISD::P2D: { SDValue P = Op.getOperand(0); switch (P.getOpcode()) { case HexagonISD::PTRUE: @@ -3451,20 +3488,9 @@ HexagonTargetLowering::PerformDAGCombine(SDNode *N, 
default: break; } - } else if (Opc == ISD::VSELECT) { - // This is pretty much duplicated in HexagonISelLoweringHVX... - // - // (vselect (xor x, ptrue), v0, v1) -> (vselect x, v1, v0) - SDValue Cond = Op.getOperand(0); - if (Cond->getOpcode() == ISD::XOR) { - SDValue C0 = Cond.getOperand(0), C1 = Cond.getOperand(1); - if (C1->getOpcode() == HexagonISD::PTRUE) { - SDValue VSel = DCI.DAG.getNode(ISD::VSELECT, dl, ty(Op), C0, - Op.getOperand(2), Op.getOperand(1)); - return VSel; - } - } - } else if (Opc == ISD::TRUNCATE) { + break; + } + case ISD::TRUNCATE: { SDValue Op0 = Op.getOperand(0); // fold (truncate (build pair x, y)) -> (truncate x) or x if (Op0.getOpcode() == ISD::BUILD_PAIR) { @@ -3477,7 +3503,9 @@ HexagonTargetLowering::PerformDAGCombine(SDNode *N, if (ty(Elem0).bitsGT(TruncTy)) return DCI.DAG.getNode(ISD::TRUNCATE, dl, TruncTy, Elem0); } - } else if (Opc == ISD::OR) { + break; + } + case ISD::OR: { // fold (or (shl xx, s), (zext y)) -> (COMBINE (shl xx, s-32), y) // if s >= 32 auto fold0 = [&, this](SDValue Op) { @@ -3507,6 +3535,8 @@ HexagonTargetLowering::PerformDAGCombine(SDNode *N, if (SDValue R = fold0(Op)) return R; + break; + } } return SDValue(); @@ -3750,6 +3780,78 @@ EVT HexagonTargetLowering::getOptimalMemOpType( return MVT::Other; } +// The helpers below are versions of llvm::getShuffleReduction and +// llvm::getOrderedReduction, adapted to use during DAG passes and simplified as +// follows: +// - ICmp and FCmp are not handled; +// - in every step in getShuffleReduction, the input is split into halves (not +// pairwise). 
+
+// Scalarizing reduction: extract every element of Vec and combine them one by
+// one, in element order, with the binary ISD opcode Op.
+static SDValue getOrderedReduction(SDValue Vec, unsigned Op,
+                                   SelectionDAG &DAG) {
+  EVT VT = Vec.getValueType();
+  EVT EltT = VT.getVectorElementType();
+  unsigned VF = VT.getVectorNumElements();
+  assert(VF > 0 &&
+         "Reduction emission only supported for non-zero length vectors!");
+
+  SDLoc DL(Vec);
+  SDValue Result = DAG.getExtractVectorElt(DL, EltT, Vec, 0);
+  for (unsigned ExtractIdx = 1; ExtractIdx < VF; ++ExtractIdx) {
+    SDValue Ext = DAG.getExtractVectorElt(DL, EltT, Vec, ExtractIdx);
+    Result = DAG.getNode(Op, DL, EltT, {Result, Ext});
+  }
+
+  return Result;
+}
+
+// Log2 ladder reduction: in every round the upper half of the remaining
+// elements is shuffled down onto the lower half and combined with it using the
+// binary ISD opcode Op. The number of elements must be a power of two.
+static SDValue getShuffleReduction(SDValue Vec, unsigned Op,
+                                   SelectionDAG &DAG) {
+  EVT VT = Vec.getValueType();
+  unsigned VF = VT.getVectorNumElements();
+  // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
+  // and vector ops, reducing the set of values being computed by half each
+  // round. Zero is not a power of 2, so empty vectors are rejected here too.
+  assert(isPowerOf2_32(VF) &&
+         "Reduction emission only supported for pow2 vectors!");
+
+  SDLoc DL(Vec);
+  // TODO: Is it correct to create double-vector shuffle and fill 3/4 of it with
+  // undefs?
+  SmallVector<int, 32> ShuffleMask(VF);
+  for (unsigned i = VF; i > 1; i >>= 1) {
+    // Move the upper half of the vector to the lower half.
+    for (unsigned j = 0; j != i / 2; ++j)
+      ShuffleMask[j] = i / 2 + j;
+    // Fill the rest of the mask with undef.
+    std::fill(ShuffleMask.begin() + i / 2, ShuffleMask.end(), -1);
+
+    SDValue Shuf =
+        DAG.getVectorShuffle(VT, DL, Vec, DAG.getUNDEF(VT), ShuffleMask);
+
+    Vec = DAG.getNode(Op, DL, VT, {Vec, Shuf});
+  }
+  // The result is in the first element of the vector.
+  return DAG.getExtractVectorElt(DL, VT.getVectorElementType(), Vec, 0);
+}
+
+// Expand VECREDUCE_ADD node N; both paths below always produce a value.
+SDValue HexagonTargetLowering::expandVecReduceAdd(SDNode *N,
+                                                  SelectionDAG &DAG) const {
+  // Since we disabled automatic reduction expansion, generate log2 ladder code
+  // if the vector is of a power-of-two length.
+  SDValue Input = N->getOperand(0);
+  if (isPowerOf2_32(Input.getValueType().getVectorNumElements()))
+    return getShuffleReduction(Input, ISD::ADD, DAG);
+  // Otherwise, reduction will be scalarized.
+  return getOrderedReduction(Input, ISD::ADD, DAG);
+}
+
 bool HexagonTargetLowering::allowsMemoryAccess(
     LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace,
     Align Alignment, MachineMemOperand::Flags Flags, unsigned *Fast) const {
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
index 76070c1f8a89..1fbe404a82c5 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -485,6 +485,7 @@ private:
   SDValue LowerHvxIntToFp(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerHvxPred32ToFp(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerHvxPred64ToFp(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerHvxPartialReduceMLA(SDValue Op, SelectionDAG &DAG) const;
   SDValue ExpandHvxFpToInt(SDValue Op, SelectionDAG &DAG) const;
   SDValue ExpandHvxIntToFp(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerHvxStore(SDValue Op, SelectionDAG &DAG) const;
@@ -519,10 +520,14 @@ private:
   SDValue combineTruncateBeforeLegal(SDValue Op, DAGCombinerInfo &DCI) const;
   SDValue combineConcatVectorsBeforeLegal(SDValue Op, DAGCombinerInfo & DCI)
       const;
-  SDValue combineVectorShuffleBeforeLegal(SDValue Op, DAGCombinerInfo & DCI)
-      const;
-
-  SDValue PerformHvxDAGCombine(SDNode * N, DAGCombinerInfo & DCI) const;
+  SDValue expandVecReduceAdd(SDNode *N, SelectionDAG &DAG) const;
+  SDValue createExtendingPartialReduceMLA(
+      unsigned Opcode, EVT AccEltType, unsigned AccNumElements,
EVT InputType, + const SDValue &A, const SDValue &B, unsigned &RemainingReductionRatio, + const SDLoc &DL, SelectionDAG &DAG) const; + SDValue splitVecReduceAdd(SDNode *N, SelectionDAG &DAG) const; + SDValue splitExtendingPartialReduceMLA(SDNode *N, SelectionDAG &DAG) const; + SDValue PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp index b1181dfa13a1..cec1f046469e 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -40,6 +40,8 @@ static const MVT LegalW64[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 }; static const MVT LegalV128[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 }; static const MVT LegalW128[] = { MVT::v256i8, MVT::v128i16, MVT::v64i32 }; +static const unsigned MaxExpandMLA = 8; + static std::tuple getIEEEProperties(MVT Ty) { // For a float scalar type, return (exp-bits, exp-bias, fraction-bits) MVT ElemTy = Ty.getScalarType(); @@ -504,6 +506,69 @@ HexagonTargetLowering::initializeHVXLowering() { setOperationAction(ISD::SINT_TO_FP, MVT::v32i1, Custom); setTargetDAGCombine({ISD::CONCAT_VECTORS, ISD::TRUNCATE, ISD::VSELECT}); + + setTargetDAGCombine({ISD::PARTIAL_REDUCE_SMLA, ISD::PARTIAL_REDUCE_UMLA, + ISD::PARTIAL_REDUCE_SUMLA}); + + // Partial MLA reductions. + { + static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA, + ISD::PARTIAL_REDUCE_UMLA, + ISD::PARTIAL_REDUCE_SUMLA}; + + auto HvxType = [=](MVT ScalarT, unsigned Factor = 1) { + return MVT::getVectorVT(ScalarT, Subtarget.getVectorLength() * Factor * + 8 / ScalarT.getSizeInBits()); + }; + + // Tuple of (Acc element type, input element type, vector pair). + // The assumption is both the input and reduction result are of the same + // size so the reduction ratio is the same as the ratio of element type + // sizes. This may not hold for all available instructions. 
+ typedef std::tuple ReductionSignature; + + static const std::vector NativeReductions = { + {MVT::i32, MVT::i8, false}, + }; + + for (const auto &R : NativeReductions) { + + MVT AccType = std::get<0>(R); + MVT InputType = std::get<1>(R); + unsigned Factor = std::get<2>(R) ? 2 : 1; + + // The native size is legal. + setPartialReduceMLAAction(MLAOps, HvxType(AccType), HvxType(InputType), + Legal); + + // Allow custom partial MLA reductions on larger vectors than legally + // supported. These reduction must be declared as Custom (or Legal) + // for foldPartialReduceMLAMulOp() to fold the multiply by one pattern + // inserted when the partial reduction intrinsic is converted to + // PARTIAL_REDUCE_U/S/SUMLA. Otherwise, the Split action will apply + // on the original pattern, including the extensions and multiplies, + // which will make it impossible to match. + // There are two independent ways to extend the + // input size: 1. to concatenate the result - output vector is + // proportionally extended, 2) to reduce the result - the output vector + // size stays the same. We limit allowed combinations so that the total + // number of generated reduction instructions is limited by a constant + // number. This limit is arbitrary and can be revised. On one hand, it is + // convenient to have more choices; on the other hand, there is a + // diminishing benefit of very long sequences, which should probably be + // written as loops instead. 
+ for (unsigned ConcatFactor = 1; ConcatFactor <= MaxExpandMLA; + ConcatFactor <<= 1) + for (unsigned ReductionFactor = 1; ReductionFactor <= MaxExpandMLA; + ReductionFactor <<= 1) + if (ConcatFactor * ReductionFactor != 1 && + ConcatFactor * ReductionFactor <= MaxExpandMLA) + setPartialReduceMLAAction( + MLAOps, HvxType(AccType, Factor * ConcatFactor), + HvxType(InputType, Factor * ConcatFactor * ReductionFactor), + Custom); + } + } } unsigned @@ -3678,6 +3743,11 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const { case HexagonISD::SMUL_LOHI: case HexagonISD::UMUL_LOHI: case HexagonISD::USMUL_LOHI: return LowerHvxMulLoHi(Op, DAG); + + case ISD::PARTIAL_REDUCE_SMLA: + case ISD::PARTIAL_REDUCE_UMLA: + case ISD::PARTIAL_REDUCE_SUMLA: + return LowerHvxPartialReduceMLA(Op, DAG); // clang-format on } #ifndef NDEBUG @@ -4020,6 +4090,198 @@ HexagonTargetLowering::combineConcatVectorsBeforeLegal( return DAG.getVectorShuffle(LongTy, dl, Cat, DAG.getUNDEF(LongTy), LongMask); } +// Create the inner partial reduction MLA that can be efficiently lowered. This +// function is used by partial and full reductions. +SDValue HexagonTargetLowering::createExtendingPartialReduceMLA( + unsigned Opcode, EVT AccEltType, unsigned AccNumElements, EVT InputType, + const SDValue &A, const SDValue &B, unsigned &RemainingReductionRatio, + const SDLoc &DL, SelectionDAG &DAG) const { + const auto &Subtarget = DAG.getSubtarget(); + if (!Subtarget.useHVXOps()) + return SDValue(); + + EVT InputEltType = InputType.getVectorElementType(); + + // Find if an optimized instruction for the sub-reduction is available. + unsigned NativeRatio; + if (AccEltType == MVT::i32 && InputEltType == MVT::i8) + NativeRatio = 4; + else + return SDValue(); + + // We only handle the case when additional reduction will be needed, i.e. + // input is longer by a larger factor than the result. 
+ ElementCount InputEC = InputType.getVectorElementCount(); + if (!InputEC.isKnownMultipleOf(AccNumElements * NativeRatio)) + return SDValue(); + + unsigned InputNumElements = InputEC.getFixedValue(); + RemainingReductionRatio = InputNumElements / (AccNumElements * NativeRatio); + if (RemainingReductionRatio == 1) + return SDValue(); + + // Create a reduction by the natively supported factor. + EVT IntermediateType = EVT::getVectorVT(*DAG.getContext(), AccEltType, + InputNumElements / NativeRatio); + + SDValue Zero = DAG.getConstant(0, DL, IntermediateType); + return DAG.getNode(Opcode, DL, IntermediateType, Zero, A, B); +} + +static bool DetectExtendingMultiply(const SDValue &N, EVT ScalarType, + unsigned &Opcode, SDValue &A, SDValue &B) { + SDValue Mul = N; + EVT AccType = Mul.getValueType(); // Vector input type after extension. + if (ScalarType != AccType.getVectorElementType()) + return false; + bool swap = false; + if (Mul->getOpcode() != ISD::MUL) + return false; + A = Mul->getOperand(0); + B = Mul->getOperand(1); + if (A.getOpcode() == ISD::ZERO_EXTEND) { + if (B.getOpcode() == ISD::ZERO_EXTEND) + Opcode = ISD::PARTIAL_REDUCE_UMLA; + else if (B.getOpcode() == ISD::SIGN_EXTEND) { + swap = true; + Opcode = ISD::PARTIAL_REDUCE_SUMLA; + } else + return false; + } else if (A.getOpcode() == ISD::SIGN_EXTEND) { + if (B.getOpcode() == ISD::ZERO_EXTEND) + Opcode = ISD::PARTIAL_REDUCE_SUMLA; + else if (B.getOpcode() == ISD::SIGN_EXTEND) + Opcode = ISD::PARTIAL_REDUCE_SMLA; + else + return false; + } else + return false; + + // Get multiplication arguments before extension. 
+ A = A->getOperand(0); + B = B->getOperand(0); + if (A.getValueType() != B.getValueType()) + return false; + + if (swap) + std::swap(A, B); + + return true; +} + +SDValue HexagonTargetLowering::splitVecReduceAdd(SDNode *N, + SelectionDAG &DAG) const { + if (!Subtarget.useHVXOps()) + return SDValue(); + + EVT ScalarType = N->getValueType(0); + unsigned Opcode; + SDValue A, B; + if (!DetectExtendingMultiply(N->getOperand(0), ScalarType, Opcode, A, B)) + return SDValue(); + + SDLoc DL(N); + unsigned RemainingReductionRatio; + SDValue Partial = + createExtendingPartialReduceMLA(Opcode, ScalarType, 1, A.getValueType(), + A, B, RemainingReductionRatio, DL, DAG); + if (!Partial) + return SDValue(); + + // We could have inserted a trivial MLA and rely on the folding action, + // similar to how vector_partial_reduce_add is lowered to an MLA in + // SelectionDAGBuilder. However, we just replace the final result since we + // have analyzed the input completely. + return DAG.getNode(ISD::VECREDUCE_ADD, DL, ScalarType, Partial); +} + +// When possible, separate an MLA reduction with extended operands but +// unsupported reduction factor into an extending partial reduction that +// can be efficiently lowered, and a follow-up partial reduction. +// partial_reduce_mla(a, x, y) -> +// partial_reduce_mla(a, partial_reduce_mla(0, x, y), 1) +SDValue +HexagonTargetLowering::splitExtendingPartialReduceMLA(SDNode *N, + SelectionDAG &DAG) const { + if (!Subtarget.useHVXOps()) + return SDValue(); + + SDValue Acc = N->getOperand(0); + SDValue A = N->getOperand(1); + SDValue B = N->getOperand(2); + if (A.getValueType() != B.getValueType()) + return SDValue(); + + // The types should be declared as custom, but do not split already legal + // operation. 
+ EVT AccType = Acc.getValueType(); + EVT InputType = A.getValueType(); + if (getPartialReduceMLAAction(N->getOpcode(), AccType, InputType) != Custom) + return SDValue(); + + SDLoc DL(N); + unsigned RemainingReductionRatio; + SDValue Partial = createExtendingPartialReduceMLA( + N->getOpcode(), AccType.getVectorElementType(), + AccType.getVectorNumElements(), InputType, A, B, RemainingReductionRatio, + DL, DAG); + if (!Partial) + return SDValue(); + assert(RemainingReductionRatio <= MaxExpandMLA); + + // Create the reduction for the remaining ratio. + EVT IntermediateType = Partial->getOperand(0).getValueType(); + SDValue One = DAG.getConstant(1, DL, IntermediateType); + return DAG.getNode(N->getOpcode() == ISD::PARTIAL_REDUCE_UMLA + ? ISD::PARTIAL_REDUCE_UMLA + : ISD::PARTIAL_REDUCE_SUMLA, + DL, AccType, Acc, Partial, One); +} + +SDValue +HexagonTargetLowering::LowerHvxPartialReduceMLA(SDValue Op, + SelectionDAG &DAG) const { + const SDLoc &DL(Op); + SDValue Acc = Op.getOperand(0); + SDValue A = Op.getOperand(1); + SDValue B = Op.getOperand(2); + + // Split the input vectors into units of one HVX vector length. 
+ unsigned HwVectorSizeInBits = Subtarget.getVectorLength() * 8; + + EVT AccType = Acc.getValueType(); + EVT AccEltType = AccType.getVectorElementType(); + unsigned AccSubvectorNumElements = + HwVectorSizeInBits / AccEltType.getSizeInBits(); + EVT AccSubvectorType = + EVT::getVectorVT(*DAG.getContext(), AccEltType, AccSubvectorNumElements); + + EVT InputType = A.getValueType(); + assert(InputType.getSizeInBits() % HwVectorSizeInBits == 0); + EVT InputEltType = InputType.getVectorElementType(); + unsigned InputSubvectorNumElements = + HwVectorSizeInBits / InputEltType.getSizeInBits(); + EVT InputSubvectorType = EVT::getVectorVT(*DAG.getContext(), InputEltType, + InputSubvectorNumElements); + + unsigned SubvectorNum = InputType.getFixedSizeInBits() / HwVectorSizeInBits; + SmallVector Subvectors; + + for (unsigned I = 0; I != SubvectorNum; ++I) { + SDValue SubvectorAcc = DAG.getExtractSubvector(DL, AccSubvectorType, Acc, + I * AccSubvectorNumElements); + SDValue SubvectorA = DAG.getExtractSubvector(DL, InputSubvectorType, A, + I * InputSubvectorNumElements); + SDValue SubvectorB = DAG.getExtractSubvector(DL, InputSubvectorType, B, + I * InputSubvectorNumElements); + SDValue SubvectorMLA = DAG.getNode(Op.getOpcode(), DL, AccSubvectorType, + SubvectorAcc, SubvectorA, SubvectorB); + Subvectors.push_back(SubvectorMLA); + } + + return DAG.getNode(ISD::CONCAT_VECTORS, DL, AccType, Subvectors); +} + SDValue HexagonTargetLowering::PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -4039,43 +4301,33 @@ HexagonTargetLowering::PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI) return SDValue(); switch (Opc) { - case ISD::VSELECT: { - // (vselect (xor x, qtrue), v0, v1) -> (vselect x, v1, v0) - SDValue Cond = Ops[0]; - if (Cond->getOpcode() == ISD::XOR) { - SDValue C0 = Cond.getOperand(0), C1 = Cond.getOperand(1); - if (C1->getOpcode() == HexagonISD::QTRUE) - return DAG.getNode(ISD::VSELECT, dl, ty(Op), C0, Ops[2], Ops[1]); - } - break; + case HexagonISD::V2Q: + 
if (Ops[0].getOpcode() == ISD::SPLAT_VECTOR) { + if (const auto *C = dyn_cast(Ops[0].getOperand(0))) + return C->isZero() ? DAG.getNode(HexagonISD::QFALSE, dl, ty(Op)) + : DAG.getNode(HexagonISD::QTRUE, dl, ty(Op)); } - case HexagonISD::V2Q: - if (Ops[0].getOpcode() == ISD::SPLAT_VECTOR) { - if (const auto *C = dyn_cast(Ops[0].getOperand(0))) - return C->isZero() ? DAG.getNode(HexagonISD::QFALSE, dl, ty(Op)) - : DAG.getNode(HexagonISD::QTRUE, dl, ty(Op)); - } - break; - case HexagonISD::Q2V: - if (Ops[0].getOpcode() == HexagonISD::QTRUE) - return DAG.getNode(ISD::SPLAT_VECTOR, dl, ty(Op), - DAG.getAllOnesConstant(dl, MVT::i32)); - if (Ops[0].getOpcode() == HexagonISD::QFALSE) - return getZero(dl, ty(Op), DAG); - break; - case HexagonISD::VINSERTW0: - if (isUndef(Ops[1])) - return Ops[0]; - break; - case HexagonISD::VROR: { - if (Ops[0].getOpcode() == HexagonISD::VROR) { - SDValue Vec = Ops[0].getOperand(0); - SDValue Rot0 = Ops[1], Rot1 = Ops[0].getOperand(1); - SDValue Rot = DAG.getNode(ISD::ADD, dl, ty(Rot0), {Rot0, Rot1}); - return DAG.getNode(HexagonISD::VROR, dl, ty(Op), {Vec, Rot}); - } - break; + break; + case HexagonISD::Q2V: + if (Ops[0].getOpcode() == HexagonISD::QTRUE) + return DAG.getNode(ISD::SPLAT_VECTOR, dl, ty(Op), + DAG.getAllOnesConstant(dl, MVT::i32)); + if (Ops[0].getOpcode() == HexagonISD::QFALSE) + return getZero(dl, ty(Op), DAG); + break; + case HexagonISD::VINSERTW0: + if (isUndef(Ops[1])) + return Ops[0]; + break; + case HexagonISD::VROR: { + if (Ops[0].getOpcode() == HexagonISD::VROR) { + SDValue Vec = Ops[0].getOperand(0); + SDValue Rot0 = Ops[1], Rot1 = Ops[0].getOperand(1); + SDValue Rot = DAG.getNode(ISD::ADD, dl, ty(Rot0), {Rot0, Rot1}); + return DAG.getNode(HexagonISD::VROR, dl, ty(Op), {Vec, Rot}); } + break; + } } return SDValue(); diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td index c6ea6d02bb5d..a4b8d895672a 100644 --- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td +++ 
b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
@@ -456,6 +456,13 @@ let Predicates = [UseHVX] in {
             (VShuff (V6_vmpyhus_acc (VDeal $Vx, -4), HVI16:$Vs, HVI16:$Vt), -4)>;
   }
 
+
+  def : Pat<(VecI32 (partial_reduce_umla VecI32:$Acc, HVI8:$A, HVI8:$B)),
+            (V6_vrmpyubv_acc $Acc, $A, $B)>;
+  def : Pat<(VecI32 (partial_reduce_smla VecI32:$Acc, HVI8:$A, HVI8:$B)),
+            (V6_vrmpybv_acc $Acc, $A, $B)>;
+  def : Pat<(VecI32 (partial_reduce_sumla VecI32:$Acc, HVI8:$A, HVI8:$B)),
+            (V6_vrmpybusv_acc $Acc, $B, $A)>;
 }
 
 let Predicates = [UseHVX] in {
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 25ede7d26254..c2e9b3527e1e 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -327,6 +327,13 @@ InstructionCost HexagonTTIImpl::getVectorInstrCost(
   return 1;
 }
 
+// Keep llvm.vector.reduce.add for instruction selection, where it is either
+// split into a partial MLA reduction or expanded by the Hexagon DAG combine.
+// All other reductions are still expanded by the generic pass.
+bool HexagonTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
+  return II->getIntrinsicID() != Intrinsic::vector_reduce_add;
+}
+
 bool HexagonTTIImpl::isLegalMaskedStore(Type *DataType, Align /*Alignment*/,
                                         unsigned /*AddressSpace*/,
                                         TTI::MaskKind /*MaskKind*/) const {
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index 0bd07a97ff3d..25e1b5b5f645 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -156,7 +156,7 @@ public:
                      const Instruction *I = nullptr) const override {
     return 1;
   }
-
+  bool shouldExpandReduction(const IntrinsicInst *II) const override;
   bool isLegalMaskedStore(Type *DataType, Align Alignment,
                           unsigned AddressSpace,
                           TTI::MaskKind MaskKind) const override;
diff --git a/llvm/test/CodeGen/Hexagon/expand-vecreduce-add.ll b/llvm/test/CodeGen/Hexagon/expand-vecreduce-add.ll
new file mode 100644
index 000000000000..cd64099faebb
--- /dev/null
+++
b/llvm/test/CodeGen/Hexagon/expand-vecreduce-add.ll @@ -0,0 +1,143 @@ +; RUN: llc -mtriple=hexagon < %s | FileCheck %s + +target triple = "hexagon" + +define i32 @add_v32i32(<32 x i32> %vec) #0 { +; CHECK-LABEL: add_v32i32: +; CHECK: { +; CHECK: [[R0:v[0-9]+]] = valign([[_:v[0-9]+]],v0,{{.+}}) +; CHECK: } +; CHECK: { +; CHECK: [[R1:v[0-9]+]].w = vadd(v0.w,[[R0]].w) +; CHECK: } +; CHECK: { +; CHECK: [[R2:v[0-9]+]] = valign([[_:v[0-9]+]],[[R1]],{{.+}}) +; CHECK: } +; CHECK: { +; CHECK: [[R3:v[0-9]+]].w = vadd([[R1]].w,[[R2]].w) +; CHECK: } +; CHECK: { +; CHECK: [[R4:v[0-9]+]] = valign([[_:v[0-9]+]],[[R3]],{{.+}}) +; CHECK: } +; CHECK: { +; CHECK: [[R5:v[0-9]+]].w = vadd([[R3]].w,[[R4]].w) +; CHECK: } +; CHECK: { +; CHECK: [[R6:v[0-9]+]] = valign([[_:v[0-9]+]],[[R5]],{{.+}}) +; CHECK: } +; CHECK: { +; CHECK: [[R7:v[0-9]+]].w = vadd([[R5]].w,[[R6]].w) +; CHECK: } +; CHECK: { +; CHECK: [[R8:v[0-9]+]] = valign([[_:v[0-9]+]],[[R7]],{{.+}}) +; CHECK: } +; CHECK: { +; CHECK: [[R9:v[0-9]+]].w = vadd([[R7]].w,[[R8]].w) +; CHECK: } +; CHECK: { +; CHECK: r0 = vextract([[R9]],{{.+}}) +; CHECK: } +entry: + %r = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %vec) + ret i32 %r +} + +define i32 @add_v16i32(<16 x i32> %vec) #0 { +; CHECK-LABEL: add_v16i32: +; CHECK: { +; CHECK: [[R0:v[0-9]+]] = valign([[_:v[0-9]+]],v0,{{.+}}) +; CHECK: } +; CHECK: { +; CHECK: [[R1:v[0-9]+]].w = vadd(v0.w,[[R0]].w) +; CHECK: } +; CHECK: { +; CHECK: [[R2:v[0-9]+]] = valign([[_:v[0-9]+]],[[R1]],{{.+}}) +; CHECK: } +; CHECK: { +; CHECK: [[R3:v[0-9]+]].w = vadd([[R1]].w,[[R2]].w) +; CHECK: } +; CHECK: { +; CHECK: [[R4:v[0-9]+]] = valign([[_:v[0-9]+]],[[R3]],{{.+}}) +; CHECK: } +; CHECK: { +; CHECK: [[R5:v[0-9]+]].w = vadd([[R3]].w,[[R4]].w) +; CHECK: } +; CHECK: { +; CHECK: [[R6:v[0-9]+]] = valign([[_:v[0-9]+]],[[R5]],{{.+}}) +; CHECK: } +; CHECK: { +; CHECK: [[R7:v[0-9]+]].w = vadd([[R5]].w,[[R6]].w) +; CHECK: } +; CHECK: { +; CHECK: r0 = vextract([[R7]],{{.+}}) +; CHECK: } +entry: + %r = call i32 
@llvm.vector.reduce.add.v16i32(<16 x i32> %vec) + ret i32 %r +} + +define i32 @add_v8i32(<8 x i32> %vec) #0 { +; CHECK-LABEL: add_v8i32: +; CHECK: { +; CHECK: r[[RS1:[0-9]+:[0-9]+]] = vaddw(r1:0,r5:4) +; CHECK: r[[R6:[0-9]+:[0-9]+]] = memd(r29+#0) +; CHECK: } +; CHECK: { +; CHECK: r[[RS2:[0-9]+:[0-9]+]] = vaddw(r3:2,r[[R6]]) +; CHECK: } +; CHECK: { +; CHECK: r[[RS3:[0-9]+:[0-9]+]] = vaddw(r[[RS1]],r[[RS2]]) +; CHECK: } +; CHECK: { +;; TODO: combine and double register add can be optimized to single register add. +; CHECK: r[[RS4:[0-9]+:[0-9]+]] = combine(#0,r{{[0-9]+}}) +; CHECK: } +; CHECK: { +; CHECK: r1:0 = vaddw(r[[RS3]],r[[RS4]]) +entry: + %r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %vec) + ret i32 %r +} + +define i32 @add_v64i32(<64 x i32> %vec) #0 { +; CHECK-LABEL: add_v64i32: +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +entry: + %r = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %vec) + ret i32 %r +} + +;; Non-pow2 vectors are scalarized. 
+ +define i32 @add_v12i32(<12 x i32> %vec) #0 { +; CHECK-LABEL: add_v12i32: +; CHECK: [[RS0:r[0-9]+]] = add(r0,r1) +; CHECK: [[RS1:r[0-9]+]] += add([[RS0]],r{{[0-9]+}}) +; CHECK: [[RS2:r[0-9]+]] += add([[RS1]],r{{[0-9]+}}) +; CHECK: [[RS3:r[0-9]+]] += add([[RS2]],r{{[0-9]+}}) +; CHECK: [[RS4:r[0-9]+]] += add([[RS3]],r{{[0-9]+}}) +; CHECK: [[RS5:r[0-9]+]] += add([[RS4]],r{{[0-9]+}}) +entry: + %r = call i32 @llvm.vector.reduce.add.v12i32(<12 x i32> %vec) + ret i32 %r +} + +define i32 @add_v3i32(<3 x i32> %vec) #0 { +; CHECK-LABEL: add_v3i32: +; CHECK: r{{[0-9]+}} += add(r{{[0-9]+}},r{{[0-9]+}}) +entry: + %r = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> %vec) + ret i32 %r +} + +attributes #0 = { nounwind readnone "target-cpu"="hexagonv68" "target-features"="+hvx,+hvx-length128b" } diff --git a/llvm/test/CodeGen/Hexagon/hvx-full-reduce.ll b/llvm/test/CodeGen/Hexagon/hvx-full-reduce.ll new file mode 100644 index 000000000000..c49b7305725b --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/hvx-full-reduce.ll @@ -0,0 +1,145 @@ +; RUN: llc -mtriple=hexagon < %s | FileCheck %s + +define i32 @full_reduce_i32_128i8_uu(<128 x i8> %x, <128 x i8> %y) #0 { +; CHECK-LABEL: full_reduce_i32_128i8_uu: +; CHECK: [[A:v[0-9]+]] = vxor([[Z:v[0-9]+]],[[Z]]) +; CHECK: [[A]].uw += vrmpy(v0.ub,v1.ub) +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: vextract + %x.wide = zext <128 x i8> %x to <128 x i32> + %y.wide = zext <128 x i8> %y to <128 x i32> + %m = mul nuw nsw <128 x i32> %x.wide, %y.wide + %reduce = tail call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> %m) + ret i32 %reduce +} + +define i32 @full_reduce_i32_128i8_su(<128 x i8> %x, <128 x i8> %y) #0 { +; CHECK-LABEL: full_reduce_i32_128i8_su: +; CHECK: [[A:v[0-9]+]] = vxor([[Z:v[0-9]+]],[[Z]]) +; CHECK: [[A]].w += vrmpy(v1.ub,v0.b) +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: valign 
+; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: vextract + %x.wide = sext <128 x i8> %x to <128 x i32> + %y.wide = zext <128 x i8> %y to <128 x i32> + %m = mul nuw nsw <128 x i32> %x.wide, %y.wide + %reduce = tail call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> %m) + ret i32 %reduce +} + +define i32 @full_reduce_i32_128i8_us(<128 x i8> %x, <128 x i8> %y) #0 { +; CHECK-LABEL: full_reduce_i32_128i8_us: +; CHECK: [[A:v[0-9]+]] = vxor([[Z:v[0-9]+]],[[Z]]) +; CHECK: [[A]].w += vrmpy(v0.ub,v1.b) +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: vextract + %x.wide = zext <128 x i8> %x to <128 x i32> + %y.wide = sext <128 x i8> %y to <128 x i32> + %m = mul nuw nsw <128 x i32> %x.wide, %y.wide + %reduce = tail call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> %m) + ret i32 %reduce +} + +define i32 @full_reduce_i32_128i8_ss(<128 x i8> %x, <128 x i8> %y) #0 { +; CHECK-LABEL: full_reduce_i32_128i8_ss: +; CHECK: [[A:v[0-9]+]] = vxor([[Z:v[0-9]+]],[[Z]]) +; CHECK: [[A]].w += vrmpy(v0.b,v1.b) +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: vextract + %x.wide = sext <128 x i8> %x to <128 x i32> + %y.wide = sext <128 x i8> %y to <128 x i32> + %m = mul nuw nsw <128 x i32> %x.wide, %y.wide + %reduce = tail call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> %m) + ret i32 %reduce +} + +;; Double-vector input. 
+ +define i32 @full_reduce_i32_256i8(<256 x i8> %x, <256 x i8> %y) #0 { +; CHECK-LABEL: full_reduce_i32_256i8: +; CHECK: vrmpy +; CHECK: vrmpy +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd + %x.wide = zext <256 x i8> %x to <256 x i32> + %y.wide = zext <256 x i8> %y to <256 x i32> + %m = mul nuw nsw <256 x i32> %x.wide, %y.wide + %reduce = tail call i32 @llvm.vector.reduce.add.v256i32(<256 x i32> %m) + ret i32 %reduce +} + +;; Maximum handled vector size. + +define i32 @full_reduce_i32_1024i8(<1024 x i8> %x, <1024 x i8> %y) #0 { +; CHECK-LABEL: full_reduce_i32_1024i8: +; CHECK: vrmpy +; CHECK: vrmpy +; CHECK: vrmpy +; CHECK: vrmpy +; CHECK: vrmpy +; CHECK: vrmpy +; CHECK: vrmpy +; CHECK: vrmpy +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd +; CHECK: valign +; CHECK: vadd + %x.wide = zext <1024 x i8> %x to <1024 x i32> + %y.wide = zext <1024 x i8> %y to <1024 x i32> + %m = mul nuw nsw <1024 x i32> %x.wide, %y.wide + %reduce = tail call i32 @llvm.vector.reduce.add.v1024i32(<1024 x i32> %m) + ret i32 %reduce +} + +attributes #0 = { nounwind readnone "target-cpu"="hexagonv68" "target-features"="+hvx,+hvx-length128b" } diff --git a/llvm/test/CodeGen/Hexagon/hvx-partial-reduce.ll b/llvm/test/CodeGen/Hexagon/hvx-partial-reduce.ll new file mode 100644 index 000000000000..b2c2c1893ea4 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/hvx-partial-reduce.ll @@ -0,0 +1,162 @@ +;; Check HVX vectorization. +; RUN: llc -mtriple hexagon < %s | FileCheck %s --check-prefixes=CHECK,CHECK-HVX + +;; Check that there is no failure when compiling to scalar code, don't check the output. 
+; RUN: llc -mtriple hexagon -mattr=-hvx,-hvxv73,-hvx-length128b < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NO-HVX + +define <16 x i32> @partial_reduce_uu_64(<16 x i32> %acc, <64 x i8> %x, <64 x i8> %y) #0 { +; CHECK-LABEL: partial_reduce_uu_64: +; CHECK-HVX: v0.uw += vrmpy(v1.ub,v2.ub) +; CHECK-NO-HVX: {{r[0-9]+}} += mpyi + %x.ext = zext <64 x i8> %x to <64 x i32> + %y.ext = zext <64 x i8> %y to <64 x i32> + %m = mul nuw nsw <64 x i32> %x.ext, %y.ext + %partial.reduce = tail call <16 x i32> @llvm.vector.partial.reduce.add.v16i32.v64i32(<16 x i32> %acc, <64 x i32> %m) + ret <16 x i32> %partial.reduce +} + +define <16 x i32> @partial_reduce_su_64(<16 x i32> %acc, <64 x i8> %x, <64 x i8> %y) #0 { +; CHECK-LABEL: partial_reduce_su_64: +; CHECK-HVX: v0.w += vrmpy(v2.ub,v1.b) +; CHECK-NO-HVX: {{r[0-9]+}} += mpyi + %x.ext = sext <64 x i8> %x to <64 x i32> + %y.ext = zext <64 x i8> %y to <64 x i32> + %m = mul nuw nsw <64 x i32> %x.ext, %y.ext + %partial.reduce = tail call <16 x i32> @llvm.vector.partial.reduce.add.v16i32.v64i32(<16 x i32> %acc, <64 x i32> %m) + ret <16 x i32> %partial.reduce +} + +define <16 x i32> @partial_reduce_us_64(<16 x i32> %acc, <64 x i8> %x, <64 x i8> %y) #0 { +; CHECK-LABEL: partial_reduce_us_64: +; CHECK-HVX: v0.w += vrmpy(v1.ub,v2.b) +; CHECK-NO-HVX: {{r[0-9]+}} += mpyi + %x.ext = zext <64 x i8> %x to <64 x i32> + %y.ext = sext <64 x i8> %y to <64 x i32> + %m = mul nuw nsw <64 x i32> %x.ext, %y.ext + %partial.reduce = tail call <16 x i32> @llvm.vector.partial.reduce.add.v16i32.v64i32(<16 x i32> %acc, <64 x i32> %m) + ret <16 x i32> %partial.reduce +} + +define <16 x i32> @partial_reduce_ss_64(<16 x i32> %acc, <64 x i8> %x, <64 x i8> %y) #0 { +; CHECK-LABEL: partial_reduce_ss_64: +; CHECK-HVX: v0.w += vrmpy(v1.b,v2.b) +; CHECK-NO-HVX: {{r[0-9]+}} += mpyi + %x.ext = sext <64 x i8> %x to <64 x i32> + %y.ext = sext <64 x i8> %y to <64 x i32> + %m = mul nuw nsw <64 x i32> %x.ext, %y.ext + %partial.reduce = tail call <16 x i32>
@llvm.vector.partial.reduce.add.v16i32.v64i32(<16 x i32> %acc, <64 x i32> %m) + ret <16 x i32> %partial.reduce +} + +define <32 x i32> @partial_reduce_uu_128(<32 x i32> %acc, <128 x i8> %x, <128 x i8> %y) #1 { +; CHECK-LABEL: partial_reduce_uu_128: +; CHECK-HVX: v0.uw += vrmpy(v1.ub,v2.ub) +; CHECK-NO-HVX: {{r[0-9]+}} += mpyi + %x.ext = zext <128 x i8> %x to <128 x i32> + %y.ext = zext <128 x i8> %y to <128 x i32> + %m = mul nuw nsw <128 x i32> %x.ext, %y.ext + %partial.reduce = tail call <32 x i32> @llvm.vector.partial.reduce.add.v32i32.v128i32(<32 x i32> %acc, <128 x i32> %m) + ret <32 x i32> %partial.reduce +} + +define <32 x i32> @partial_reduce_su_128(<32 x i32> %acc, <128 x i8> %x, <128 x i8> %y) #1 { +; CHECK-LABEL: partial_reduce_su_128: +; CHECK-HVX: v0.w += vrmpy(v2.ub,v1.b) +; CHECK-NO-HVX: {{r[0-9]+}} += mpyi + %x.ext = sext <128 x i8> %x to <128 x i32> + %y.ext = zext <128 x i8> %y to <128 x i32> + %m = mul nuw nsw <128 x i32> %x.ext, %y.ext + %partial.reduce = tail call <32 x i32> @llvm.vector.partial.reduce.add.v32i32.v128i32(<32 x i32> %acc, <128 x i32> %m) + ret <32 x i32> %partial.reduce +} + +define <32 x i32> @partial_reduce_us_128(<32 x i32> %acc, <128 x i8> %x, <128 x i8> %y) #1 { +; CHECK-LABEL: partial_reduce_us_128: +; CHECK-HVX: v0.w += vrmpy(v1.ub,v2.b) +; CHECK-NO-HVX: {{r[0-9]+}} += mpyi + %x.ext = zext <128 x i8> %x to <128 x i32> + %y.ext = sext <128 x i8> %y to <128 x i32> + %m = mul nuw nsw <128 x i32> %x.ext, %y.ext + %partial.reduce = tail call <32 x i32> @llvm.vector.partial.reduce.add.v32i32.v128i32(<32 x i32> %acc, <128 x i32> %m) + ret <32 x i32> %partial.reduce +} + +define <32 x i32> @partial_reduce_ss_128(<32 x i32> %acc, <128 x i8> %x, <128 x i8> %y) #1 { +; CHECK-LABEL: partial_reduce_ss_128: +; CHECK-HVX: v0.w += vrmpy(v1.b,v2.b) +; CHECK-NO-HVX: {{r[0-9]+}} += mpyi + %x.ext = sext <128 x i8> %x to <128 x i32> + %y.ext = sext <128 x i8> %y to <128 x i32> + %m = mul nuw nsw <128 x i32> %x.ext, %y.ext + %partial.reduce =
tail call <32 x i32> @llvm.vector.partial.reduce.add.v32i32.v128i32(<32 x i32> %acc, <128 x i32> %m) + ret <32 x i32> %partial.reduce +} + +;; Multiple-size inputs, same output size. +define <32 x i32> @partial_reduce_uu_32xi32_256xi8(<32 x i32> %acc, <256 x i8> %x, <256 x i8> %y) #1 { +; CHECK-LABEL: partial_reduce_uu_32xi32_256xi8: +; CHECK-HVX: [[R1:v[0-9]+]].uw += vrmpy({{v[0-9]+}}.ub,{{v[0-9]+}}.ub) +; CHECK-HVX: [[R2:v[0-9]+]].uw += vrmpy({{v[0-9]+}}.ub,{{v[0-9]+}}.ub) +; CHECK-HVX: [[R3:v[0-9]+]].w = vadd(v0.w,[[R1]].w) +; CHECK-HVX: v0.w = vadd([[R2]].w,[[R3]].w) +; CHECK-NO-HVX: {{r[0-9]+}} += mpyi + %x.ext = zext <256 x i8> %x to <256 x i32> + %y.ext = zext <256 x i8> %y to <256 x i32> + %m = mul nuw nsw <256 x i32> %x.ext, %y.ext + %partial.reduce = tail call <32 x i32> @llvm.vector.partial.reduce.add.v32i32.v256i32(<32 x i32> %acc, <256 x i32> %m) + ret <32 x i32> %partial.reduce +} + +define <32 x i32> @partial_reduce_uu_32xi32_1024xi8(<32 x i32> %acc, <1024 x i8> %x, <1024 x i8> %y) #1 { +; CHECK-LABEL: partial_reduce_uu_32xi32_1024xi8: +; CHECK-HVX-DAG: vrmpy +; CHECK-HVX-DAG: vadd +; CHECK-HVX-DAG: vrmpy +; CHECK-HVX-DAG: vadd +; CHECK-HVX-DAG: vrmpy +; CHECK-HVX-DAG: vadd +; CHECK-HVX-DAG: vrmpy +; CHECK-HVX-DAG: vadd +; CHECK-HVX-DAG: vrmpy +; CHECK-HVX-DAG: vadd +; CHECK-HVX-DAG: vrmpy +; CHECK-HVX-DAG: vadd +; CHECK-HVX-DAG: vrmpy +; CHECK-HVX-DAG: vadd +; CHECK-HVX-DAG: vrmpy +; CHECK-HVX-DAG: vadd +; CHECK-NO-HVX: {{r[0-9]+}} += mpyi + %x.ext = zext <1024 x i8> %x to <1024 x i32> + %y.ext = zext <1024 x i8> %y to <1024 x i32> + %m = mul nuw nsw <1024 x i32> %x.ext, %y.ext + %partial.reduce = tail call <32 x i32> @llvm.vector.partial.reduce.add.v32i32.v1024i32(<32 x i32> %acc, <1024 x i32> %m) + ret <32 x i32> %partial.reduce +} + +define <256 x i32> @partial_reduce_uu_64xi32_1024xi8(<256 x i32> %acc, <1024 x i8> %x, <1024 x i8> %y) #1 { +; CHECK-LABEL: partial_reduce_uu_64xi32_1024xi8: +; CHECK-HVX-COUNT-8: vrmpy +; CHECK-HVX-NOT: vadd +;
CHECK-NO-HVX: {{r[0-9]+}} += mpyi +; CHECK-HVX: dealloc_return + %x.ext = zext <1024 x i8> %x to <1024 x i32> + %y.ext = zext <1024 x i8> %y to <1024 x i32> + %m = mul nuw nsw <1024 x i32> %x.ext, %y.ext + %partial.reduce = tail call <256 x i32> @llvm.vector.partial.reduce.add.v256i32.v1024i32(<256 x i32> %acc, <1024 x i32> %m) + ret <256 x i32> %partial.reduce +} + +;; Check a vector size that does not match an available vrmpy (2x reduction). +define <64 x i32> @partial_reduce_unsupported(<64 x i32> %acc, <128 x i8> %x, <128 x i8> %y) #1 { +; CHECK-LABEL: partial_reduce_unsupported: +; CHECK-HVX: vmpy +; CHECK-HVX: vadd + %x.ext = zext <128 x i8> %x to <128 x i32> + %y.ext = zext <128 x i8> %y to <128 x i32> + %m = mul nuw nsw <128 x i32> %x.ext, %y.ext + %partial.reduce = tail call <64 x i32> @llvm.vector.partial.reduce.add.v64i32.v128i32(<64 x i32> %acc, <128 x i32> %m) + ret <64 x i32> %partial.reduce +} + +attributes #0 = { nounwind "target-cpu"="hexagonv73" "target-features"="+hvxv73,+hvx-length64b" } +attributes #1 = { nounwind "target-cpu"="hexagonv73" "target-features"="+hvxv73,+hvx-length128b" }