[Hexagon] Support partial reduction intrinsics (#179797)

This commit has the changes necessary for using vrmpy instructions in full and partial multiply/add reductions on extended arguments. There are three main parts:
- partial reduction operations PARTIAL_REDUCE_(U|S|SU)MLA are lowered to accumulating vrmpy, both for native vector sizes and for multiples of native vector sizes;
- full and partial reductions can be "split" into an inner partial reduction and a residual full or partial reduction. The inner reduction will be lowered to vrmpy due to the first change;
- vecreduce_add expansion is moved to the Hexagon backend from a generic pass, accompanied by a set of tests.

In addition, there is a minor cleanup in HexagonTargetLowering::PerformDAGCombine().
2026-02-17 14:56:05 -05:00 · 2026-02-17 14:56:05 -05:00 · 98d8b69dfc
commit 98d8b69dfc
parent 42618de278
9 changed files with 884 additions and 60 deletions
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@ -1506,6 +1506,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
  MaxStoresPerMemset = 8;
  MaxStoresPerMemsetOptSize = 4;

+  setTargetDAGCombine(ISD::VECREDUCE_ADD);
+
  //
  // Set up register classes.
  //
@ -3413,16 +3415,50 @@ HexagonTargetLowering::ReplaceNodeResults(SDNode *N,
 SDValue
 HexagonTargetLowering::PerformDAGCombine(SDNode *N,
                                         DAGCombinerInfo &DCI) const {
+  SDValue Op(N, 0);
+  const SDLoc &dl(Op);
+  unsigned Opc = Op.getOpcode();
+
+  // Combining transformations applicable for arbitrary vector sizes.
+  if (DCI.isBeforeLegalizeOps()) {
+    switch (Opc) {
+    case ISD::VECREDUCE_ADD:
+      if (SDValue V = splitVecReduceAdd(N, DCI.DAG))
+        return V;
+      if (SDValue V = expandVecReduceAdd(N, DCI.DAG))
+        return V;
+      return SDValue();
+    case ISD::PARTIAL_REDUCE_SMLA:
+    case ISD::PARTIAL_REDUCE_UMLA:
+    case ISD::PARTIAL_REDUCE_SUMLA:
+      if (SDValue V = splitExtendingPartialReduceMLA(N, DCI.DAG))
+        return V;
+      return SDValue();
+    }
+  } else {
+    switch (Opc) {
+    case ISD::VSELECT: {
+      // (vselect (xor x, ptrue), v0, v1) -> (vselect x, v1, v0)
+      SDValue Cond = Op.getOperand(0);
+      if (Cond->getOpcode() == ISD::XOR) {
+        SDValue C0 = Cond.getOperand(0), C1 = Cond.getOperand(1);
+        if (C1->getOpcode() == HexagonISD::PTRUE) {
+          SDValue VSel = DCI.DAG.getNode(ISD::VSELECT, dl, ty(Op), C0,
+                                         Op.getOperand(2), Op.getOperand(1));
+          return VSel;
+        }
+      }
+      return SDValue();
+    }
+    }
+  }
+
  if (isHvxOperation(N, DCI.DAG)) {
    if (SDValue V = PerformHvxDAGCombine(N, DCI))
      return V;
    return SDValue();
  }

-  SDValue Op(N, 0);
-  const SDLoc &dl(Op);
-  unsigned Opc = Op.getOpcode();
-
  if (Opc == ISD::TRUNCATE) {
    SDValue Op0 = Op.getOperand(0);
    // fold (truncate (build pair x, y)) -> (truncate x) or x
@ -3441,7 +3477,8 @@ HexagonTargetLowering::PerformDAGCombine(SDNode *N,
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

-  if (Opc == HexagonISD::P2D) {
+  switch (Opc) {
+  case HexagonISD::P2D: {
    SDValue P = Op.getOperand(0);
    switch (P.getOpcode()) {
    case HexagonISD::PTRUE:
@ -3451,20 +3488,9 @@ HexagonTargetLowering::PerformDAGCombine(SDNode *N,
    default:
      break;
    }
-  } else if (Opc == ISD::VSELECT) {
-    // This is pretty much duplicated in HexagonISelLoweringHVX...
-    //
-    // (vselect (xor x, ptrue), v0, v1) -> (vselect x, v1, v0)
-    SDValue Cond = Op.getOperand(0);
-    if (Cond->getOpcode() == ISD::XOR) {
-      SDValue C0 = Cond.getOperand(0), C1 = Cond.getOperand(1);
-      if (C1->getOpcode() == HexagonISD::PTRUE) {
-        SDValue VSel = DCI.DAG.getNode(ISD::VSELECT, dl, ty(Op), C0,
-                                       Op.getOperand(2), Op.getOperand(1));
-        return VSel;
-      }
-    }
-  } else if (Opc == ISD::TRUNCATE) {
+    break;
+  }
+  case ISD::TRUNCATE: {
    SDValue Op0 = Op.getOperand(0);
    // fold (truncate (build pair x, y)) -> (truncate x) or x
    if (Op0.getOpcode() == ISD::BUILD_PAIR) {
@ -3477,7 +3503,9 @@ HexagonTargetLowering::PerformDAGCombine(SDNode *N,
      if (ty(Elem0).bitsGT(TruncTy))
        return DCI.DAG.getNode(ISD::TRUNCATE, dl, TruncTy, Elem0);
    }
-  } else if (Opc == ISD::OR) {
+    break;
+  }
+  case ISD::OR: {
    // fold (or (shl xx, s), (zext y)) -> (COMBINE (shl xx, s-32), y)
    // if s >= 32
    auto fold0 = [&, this](SDValue Op) {
@ -3507,6 +3535,8 @@ HexagonTargetLowering::PerformDAGCombine(SDNode *N,

    if (SDValue R = fold0(Op))
      return R;
+    break;
+  }
  }

  return SDValue();
@ -3750,6 +3780,78 @@ EVT HexagonTargetLowering::getOptimalMemOpType(
  return MVT::Other;
 }

+// The helpers below are versions of llvm::getShuffleReduction and
+// llvm::getOrderedReduction, adapted to use during DAG passes and simplified as
+// follows:
+// - ICmp and FCmp are not handled;
+// - in every step in getShuffleReduction, the input is split into halves (not
+// pairwise).
+
+// Reduce all elements of Vec with the binary ISD opcode Op, one element at a
+// time: ((v[0] Op v[1]) Op v[2]) Op ... The result has the element type of
+// Vec.
+static SDValue getOrderedReduction(SDValue Vec, unsigned Op,
+                                   SelectionDAG &DAG) {
+  // Op is an ISD opcode (it is handed to DAG.getNode below), so a comparison
+  // is spelled ISD::SETCC here; the IR-level Instruction::ICmp/FCmp values
+  // belong to a different opcode numbering.
+  assert(Op != ISD::SETCC && "Comparison reductions are not handled");
+
+  EVT VT = Vec.getValueType();
+  EVT EltT = VT.getVectorElementType();
+  unsigned VF = VT.getVectorNumElements();
+  assert(VF > 0 &&
+         "Reduction emission only supported for non-zero length vectors!");
+
+  SDLoc DL(Vec);
+  // Seed the accumulator with element 0, then fold in the remaining elements
+  // in index order.
+  SDValue Result = DAG.getExtractVectorElt(DL, EltT, Vec, 0);
+  for (unsigned ExtractIdx = 1; ExtractIdx < VF; ++ExtractIdx) {
+    SDValue Ext = DAG.getExtractVectorElt(DL, EltT, Vec, ExtractIdx);
+    Result = DAG.getNode(Op, DL, EltT, {Result, Ext});
+  }
+
+  return Result;
+}
+
+// Reduce all elements of Vec with the binary ISD opcode Op using a halving
+// ladder: in every round the upper half of the still-live elements is
+// shuffled down onto the lower half and combined with it, so that after
+// log2(VF) rounds the result is in element 0. Requires a power-of-two number
+// of elements.
+static SDValue getShuffleReduction(SDValue Vec, unsigned Op,
+                                   SelectionDAG &DAG) {
+  // Op is an ISD opcode (it is handed to DAG.getNode below), so a comparison
+  // is spelled ISD::SETCC here; the IR-level Instruction::ICmp/FCmp values
+  // belong to a different opcode numbering.
+  assert(Op != ISD::SETCC && "Comparison reductions are not handled");
+
+  EVT VT = Vec.getValueType();
+  unsigned VF = VT.getVectorNumElements();
+  assert(VF > 0 &&
+         "Reduction emission only supported for non-zero length vectors!");
+  // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
+  // and vector ops, reducing the set of values being computed by half each
+  // round.
+  assert(isPowerOf2_32(VF) &&
+         "Reduction emission only supported for pow2 vectors!");
+
+  SDLoc DL(Vec);
+  // TODO: Is it correct to create double-vector shuffle and fill 3/4 of it with
+  // undefs?
+  SmallVector<int, 32> ShuffleMask(VF);
+  for (unsigned i = VF; i > 1; i >>= 1) {
+    // Move the upper half of the vector to the lower half.
+    for (unsigned j = 0; j != i / 2; ++j)
+      ShuffleMask[j] = i / 2 + j;
+    // Fill the rest of the mask with undef.
+    std::fill(ShuffleMask.begin() + i / 2, ShuffleMask.end(), -1);
+
+    SDValue Shuf =
+        DAG.getVectorShuffle(VT, DL, Vec, DAG.getUNDEF(VT), ShuffleMask);
+
+    Vec = DAG.getNode(Op, DL, VT, {Vec, Shuf});
+  }
+  // The result is in the first element of the vector.
+  return DAG.getExtractVectorElt(DL, VT.getVectorElementType(), Vec, 0);
+}
+
+// Expand a vecreduce_add node. Automatic reduction expansion is disabled for
+// it, so the expansion is produced here: a scalarized chain of adds for
+// vectors whose length is not a power of two, and a log2 shuffle ladder
+// otherwise.
+SDValue HexagonTargetLowering::expandVecReduceAdd(SDNode *N,
+                                                  SelectionDAG &DAG) const {
+  SDValue Vec = N->getOperand(0);
+  const unsigned NumElts = Vec.getValueType().getVectorNumElements();
+
+  // Non-power-of-two lengths cannot use the halving ladder; scalarize them.
+  if (!isPowerOf2_32(NumElts))
+    return getOrderedReduction(Vec, ISD::ADD, DAG);
+
+  return getShuffleReduction(Vec, ISD::ADD, DAG);
+}
+
 bool HexagonTargetLowering::allowsMemoryAccess(
    LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace,
    Align Alignment, MachineMemOperand::Flags Flags, unsigned *Fast) const {
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
@ -485,6 +485,7 @@ private:
  SDValue LowerHvxIntToFp(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerHvxPred32ToFp(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerHvxPred64ToFp(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerHvxPartialReduceMLA(SDValue Op, SelectionDAG &DAG) const;
  SDValue ExpandHvxFpToInt(SDValue Op, SelectionDAG &DAG) const;
  SDValue ExpandHvxIntToFp(SDValue Op, SelectionDAG &DAG) const;
  SDValue LowerHvxStore(SDValue Op, SelectionDAG &DAG) const;
@ -519,10 +520,14 @@ private:
  SDValue combineTruncateBeforeLegal(SDValue Op, DAGCombinerInfo &DCI) const;
  SDValue combineConcatVectorsBeforeLegal(SDValue Op, DAGCombinerInfo & DCI)
      const;
-  SDValue combineVectorShuffleBeforeLegal(SDValue Op, DAGCombinerInfo & DCI)
-      const;
-
-  SDValue PerformHvxDAGCombine(SDNode * N, DAGCombinerInfo & DCI) const;
+  SDValue expandVecReduceAdd(SDNode *N, SelectionDAG &DAG) const;
+  SDValue createExtendingPartialReduceMLA(
+      unsigned Opcode, EVT AccEltType, unsigned AccNumElements, EVT InputType,
+      const SDValue &A, const SDValue &B, unsigned &RemainingReductionRatio,
+      const SDLoc &DL, SelectionDAG &DAG) const;
+  SDValue splitVecReduceAdd(SDNode *N, SelectionDAG &DAG) const;
+  SDValue splitExtendingPartialReduceMLA(SDNode *N, SelectionDAG &DAG) const;
+  SDValue PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 };

 } // end namespace llvm
--- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@ -40,6 +40,8 @@ static const MVT LegalW64[] =  { MVT::v128i8, MVT::v64i16,  MVT::v32i32 };
 static const MVT LegalV128[] = { MVT::v128i8, MVT::v64i16,  MVT::v32i32 };
 static const MVT LegalW128[] = { MVT::v256i8, MVT::v128i16, MVT::v64i32 };

+static const unsigned MaxExpandMLA = 8;
+
 static std::tuple<unsigned, unsigned, unsigned> getIEEEProperties(MVT Ty) {
  // For a float scalar type, return (exp-bits, exp-bias, fraction-bits)
  MVT ElemTy = Ty.getScalarType();
@ -504,6 +506,69 @@ HexagonTargetLowering::initializeHVXLowering() {
  setOperationAction(ISD::SINT_TO_FP, MVT::v32i1, Custom);

  setTargetDAGCombine({ISD::CONCAT_VECTORS, ISD::TRUNCATE, ISD::VSELECT});
+
+  setTargetDAGCombine({ISD::PARTIAL_REDUCE_SMLA, ISD::PARTIAL_REDUCE_UMLA,
+                       ISD::PARTIAL_REDUCE_SUMLA});
+
+  // Partial MLA reductions.
+  {
+    static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
+                                      ISD::PARTIAL_REDUCE_UMLA,
+                                      ISD::PARTIAL_REDUCE_SUMLA};
+
+    auto HvxType = [=](MVT ScalarT, unsigned Factor = 1) {
+      return MVT::getVectorVT(ScalarT, Subtarget.getVectorLength() * Factor *
+                                           8 / ScalarT.getSizeInBits());
+    };
+
+    // Tuple of (Acc element type, input element type, vector pair).
+    // The assumption is both the input and reduction result are of the same
+    // size so the reduction ratio is the same as the ratio of element type
+    // sizes. This may not hold for all available instructions.
+    typedef std::tuple<MVT, MVT, bool> ReductionSignature;
+
+    static const std::vector<ReductionSignature> NativeReductions = {
+        {MVT::i32, MVT::i8, false},
+    };
+
+    for (const auto &R : NativeReductions) {
+
+      MVT AccType = std::get<0>(R);
+      MVT InputType = std::get<1>(R);
+      unsigned Factor = std::get<2>(R) ? 2 : 1;
+
+      // The native size is legal.
+      setPartialReduceMLAAction(MLAOps, HvxType(AccType), HvxType(InputType),
+                                Legal);
+
+      // Allow custom partial MLA reductions on larger vectors than legally
+      // supported. These reduction must be declared as Custom (or Legal)
+      // for foldPartialReduceMLAMulOp() to fold the multiply by one pattern
+      // inserted when the partial reduction intrinsic is converted to
+      // PARTIAL_REDUCE_U/S/SUMLA. Otherwise, the Split action will apply
+      // on the original pattern, including the extensions and multiplies,
+      // which will make it impossible to match.
+      // There are two independent ways to extend the
+      // input size: 1. to concatenate the result - output vector is
+      // proportionally extended, 2) to reduce the result - the output vector
+      // size stays the same. We limit allowed combinations so that the total
+      // number of generated reduction instructions is limited by a constant
+      // number. This limit is arbitrary and can be revised. On one hand, it is
+      // convenient to have more choices; on the other hand, there is a
+      // diminishing benefit of very long sequences, which should probably be
+      // written as loops instead.
+      for (unsigned ConcatFactor = 1; ConcatFactor <= MaxExpandMLA;
+           ConcatFactor <<= 1)
+        for (unsigned ReductionFactor = 1; ReductionFactor <= MaxExpandMLA;
+             ReductionFactor <<= 1)
+          if (ConcatFactor * ReductionFactor != 1 &&
+              ConcatFactor * ReductionFactor <= MaxExpandMLA)
+            setPartialReduceMLAAction(
+                MLAOps, HvxType(AccType, Factor * ConcatFactor),
+                HvxType(InputType, Factor * ConcatFactor * ReductionFactor),
+                Custom);
+    }
+  }
 }

 unsigned
@ -3678,6 +3743,11 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
    case HexagonISD::SMUL_LOHI:
    case HexagonISD::UMUL_LOHI:
    case HexagonISD::USMUL_LOHI:       return LowerHvxMulLoHi(Op, DAG);
+
+    case ISD::PARTIAL_REDUCE_SMLA:
+    case ISD::PARTIAL_REDUCE_UMLA:
+    case ISD::PARTIAL_REDUCE_SUMLA:
+      return LowerHvxPartialReduceMLA(Op, DAG);
      // clang-format on
  }
 #ifndef NDEBUG
@ -4020,6 +4090,198 @@ HexagonTargetLowering::combineConcatVectorsBeforeLegal(
  return DAG.getVectorShuffle(LongTy, dl, Cat, DAG.getUNDEF(LongTy), LongMask);
 }

+// Build the inner partial MLA reduction, i.e. the part that reduces by the
+// natively supported ratio and can therefore be lowered efficiently. Used by
+// both the partial- and the full-reduction splitting code.
+//
+// On success, returns partial_reduce_mla(0, A, B) with AccEltType elements
+// and sets RemainingReductionRatio to the factor that is still left to be
+// reduced by the caller. Returns an empty SDValue when HVX is not in use,
+// when there is no native reduction for the element types, when the input
+// length is not a multiple of AccNumElements times the native ratio, or when
+// no residual reduction would remain (RemainingReductionRatio == 1).
+SDValue HexagonTargetLowering::createExtendingPartialReduceMLA(
+    unsigned Opcode, EVT AccEltType, unsigned AccNumElements, EVT InputType,
+    const SDValue &A, const SDValue &B, unsigned &RemainingReductionRatio,
+    const SDLoc &DL, SelectionDAG &DAG) const {
+  const auto &HST = DAG.getSubtarget<HexagonSubtarget>();
+  if (!HST.useHVXOps())
+    return SDValue();
+
+  EVT InputEltType = InputType.getVectorElementType();
+
+  // The only optimized sub-reduction known here accumulates i8 inputs into
+  // i32, reducing by a factor of four.
+  const bool HasNativeReduction =
+      AccEltType == MVT::i32 && InputEltType == MVT::i8;
+  if (!HasNativeReduction)
+    return SDValue();
+  const unsigned NativeRatio = 4;
+
+  // Only the case where a further reduction is needed is handled, i.e. the
+  // input must be longer than the result by more than the native ratio.
+  const unsigned NativeGroup = AccNumElements * NativeRatio;
+  ElementCount InputEC = InputType.getVectorElementCount();
+  if (!InputEC.isKnownMultipleOf(NativeGroup))
+    return SDValue();
+
+  const unsigned InputNumElements = InputEC.getFixedValue();
+  RemainingReductionRatio = InputNumElements / NativeGroup;
+  if (RemainingReductionRatio == 1)
+    return SDValue();
+
+  // Reduce by the natively supported factor into a zero accumulator.
+  EVT IntermediateType = EVT::getVectorVT(*DAG.getContext(), AccEltType,
+                                          InputNumElements / NativeRatio);
+  SDValue ZeroAcc = DAG.getConstant(0, DL, IntermediateType);
+  return DAG.getNode(Opcode, DL, IntermediateType, ZeroAcc, A, B);
+}
+
+// Match N against mul(ext(a), ext(b)), where each ext is a zero or a sign
+// extension and the element type of the product is ScalarType. On a match,
+// Opcode is set to the corresponding PARTIAL_REDUCE_(U|S|SU)MLA opcode and A
+// and B receive the operands before extension; in the mixed case they are
+// ordered so that A is the sign-extended operand and B the zero-extended one.
+// Note that Opcode, A and B may be overwritten even if no match is found.
+static bool DetectExtendingMultiply(const SDValue &N, EVT ScalarType,
+                                    unsigned &Opcode, SDValue &A, SDValue &B) {
+  // The product must already have the element type of the reduction result.
+  if (N.getValueType().getVectorElementType() != ScalarType)
+    return false;
+  if (N->getOpcode() != ISD::MUL)
+    return false;
+
+  A = N->getOperand(0);
+  B = N->getOperand(1);
+
+  // Classify both factors; anything but a zero/sign extension is rejected.
+  const unsigned ExtA = A.getOpcode();
+  const unsigned ExtB = B.getOpcode();
+  const bool SignedA = ExtA == ISD::SIGN_EXTEND;
+  const bool SignedB = ExtB == ISD::SIGN_EXTEND;
+  if (!SignedA && ExtA != ISD::ZERO_EXTEND)
+    return false;
+  if (!SignedB && ExtB != ISD::ZERO_EXTEND)
+    return false;
+
+  if (SignedA == SignedB)
+    Opcode = SignedA ? ISD::PARTIAL_REDUCE_SMLA : ISD::PARTIAL_REDUCE_UMLA;
+  else
+    Opcode = ISD::PARTIAL_REDUCE_SUMLA;
+
+  // Strip the extensions; both sources must have the same type.
+  A = A->getOperand(0);
+  B = B->getOperand(0);
+  if (A.getValueType() != B.getValueType())
+    return false;
+
+  // zext * sext: exchange the operands so that the signed one comes first.
+  if (!SignedA && SignedB)
+    std::swap(A, B);
+
+  return true;
+}
+
+// Try to rewrite vecreduce_add(mul(ext(a), ext(b))) as a vecreduce_add over
+// an inner extending partial MLA reduction of a and b. Returns an empty
+// SDValue when HVX is not in use, the operand is not an extending multiply,
+// or no suitable inner reduction can be created.
+SDValue HexagonTargetLowering::splitVecReduceAdd(SDNode *N,
+                                                 SelectionDAG &DAG) const {
+  if (!Subtarget.useHVXOps())
+    return SDValue();
+
+  const EVT ResultType = N->getValueType(0);
+  unsigned MLAOpcode;
+  SDValue LHS, RHS;
+  const bool IsExtendingMul = DetectExtendingMultiply(
+      N->getOperand(0), ResultType, MLAOpcode, LHS, RHS);
+  if (!IsExtendingMul)
+    return SDValue();
+
+  SDLoc DL(N);
+  // A full reduction has a single accumulator element. The remaining ratio
+  // is reported by the helper but not needed here.
+  unsigned ResidualRatio;
+  SDValue Inner = createExtendingPartialReduceMLA(
+      MLAOpcode, ResultType, /*AccNumElements=*/1, LHS.getValueType(), LHS,
+      RHS, ResidualRatio, DL, DAG);
+  if (!Inner)
+    return SDValue();
+
+  // A trivial MLA could have been inserted instead, relying on the folding
+  // action, similar to how vector_partial_reduce_add is lowered to an MLA in
+  // SelectionDAGBuilder. Since the input has been analyzed completely, the
+  // final result is simply replaced.
+  return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultType, Inner);
+}
+
+// When possible, separate an MLA reduction with extended operands but
+// unsupported reduction factor into an extending partial reduction that
+// can be efficiently lowered, and a follow-up partial reduction.
+// partial_reduce_mla(a, x, y) ->
+//     partial_reduce_mla(a, partial_reduce_mla(0, x, y), 1)
+// Returns an empty SDValue if the node is left unchanged.
+SDValue
+HexagonTargetLowering::splitExtendingPartialReduceMLA(SDNode *N,
+                                                      SelectionDAG &DAG) const {
+  if (!Subtarget.useHVXOps())
+    return SDValue();
+
+  SDValue Acc = N->getOperand(0);
+  SDValue A = N->getOperand(1);
+  SDValue B = N->getOperand(2);
+  if (A.getValueType() != B.getValueType())
+    return SDValue();
+
+  // The types should be declared as custom, but do not split already legal
+  // operation.
+  EVT AccType = Acc.getValueType();
+  EVT InputType = A.getValueType();
+  if (getPartialReduceMLAAction(N->getOpcode(), AccType, InputType) != Custom)
+    return SDValue();
+
+  SDLoc DL(N);
+  unsigned RemainingReductionRatio;
+  SDValue Partial = createExtendingPartialReduceMLA(
+      N->getOpcode(), AccType.getVectorElementType(),
+      AccType.getVectorNumElements(), InputType, A, B, RemainingReductionRatio,
+      DL, DAG);
+  if (!Partial)
+    return SDValue();
+  assert(RemainingReductionRatio <= MaxExpandMLA);
+
+  // Create the reduction for the remaining ratio. The type is taken from the
+  // inner node's result rather than from one of its operands, so that this
+  // does not depend on the operand layout of the node returned by getNode().
+  EVT IntermediateType = Partial.getValueType();
+  SDValue One = DAG.getConstant(1, DL, IntermediateType);
+  // NOTE(review): a signed (SMLA) inner reduction is followed by SUMLA, not
+  // SMLA. Presumably this is harmless because both inputs of the outer node
+  // already have the accumulator's element type -- confirm.
+  return DAG.getNode(N->getOpcode() == ISD::PARTIAL_REDUCE_UMLA
+                         ? ISD::PARTIAL_REDUCE_UMLA
+                         : ISD::PARTIAL_REDUCE_SUMLA,
+                     DL, AccType, Acc, Partial, One);
+}
+
+// Lower a partial MLA reduction on vectors spanning several HVX vectors by
+// splitting the accumulator and both inputs into pieces of one HVX vector
+// length each, emitting one reduction of the same opcode per piece, and
+// concatenating the per-piece results. The accumulator and the inputs must
+// have the same total size.
+SDValue
+HexagonTargetLowering::LowerHvxPartialReduceMLA(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  const SDLoc &DL(Op);
+  SDValue Acc = Op.getOperand(0);
+  SDValue A = Op.getOperand(1);
+  SDValue B = Op.getOperand(2);
+
+  // Split the input vectors into units of one HVX vector length.
+  unsigned HwVectorSizeInBits = Subtarget.getVectorLength() * 8;
+
+  EVT AccType = Acc.getValueType();
+  EVT AccEltType = AccType.getVectorElementType();
+  unsigned AccSubvectorNumElements =
+      HwVectorSizeInBits / AccEltType.getSizeInBits();
+  EVT AccSubvectorType =
+      EVT::getVectorVT(*DAG.getContext(), AccEltType, AccSubvectorNumElements);
+
+  EVT InputType = A.getValueType();
+  assert(InputType.getSizeInBits() % HwVectorSizeInBits == 0);
+  // Piece I of the accumulator is paired with piece I of the inputs below,
+  // so a reduction that shrinks the total vector size cannot be handled
+  // here; such reductions are expected to have been split off earlier.
+  assert(AccType.getSizeInBits() == InputType.getSizeInBits() &&
+         "Accumulator and input must have the same total size");
+  EVT InputEltType = InputType.getVectorElementType();
+  unsigned InputSubvectorNumElements =
+      HwVectorSizeInBits / InputEltType.getSizeInBits();
+  EVT InputSubvectorType = EVT::getVectorVT(*DAG.getContext(), InputEltType,
+                                            InputSubvectorNumElements);
+
+  unsigned SubvectorNum = InputType.getFixedSizeInBits() / HwVectorSizeInBits;
+  SmallVector<SDValue, MaxExpandMLA> Subvectors;
+
+  for (unsigned I = 0; I != SubvectorNum; ++I) {
+    // Extract the I-th HVX-sized piece of each operand and reduce it.
+    SDValue SubvectorAcc = DAG.getExtractSubvector(DL, AccSubvectorType, Acc,
+                                                   I * AccSubvectorNumElements);
+    SDValue SubvectorA = DAG.getExtractSubvector(DL, InputSubvectorType, A,
+                                                 I * InputSubvectorNumElements);
+    SDValue SubvectorB = DAG.getExtractSubvector(DL, InputSubvectorType, B,
+                                                 I * InputSubvectorNumElements);
+    SDValue SubvectorMLA = DAG.getNode(Op.getOpcode(), DL, AccSubvectorType,
+                                       SubvectorAcc, SubvectorA, SubvectorB);
+    Subvectors.push_back(SubvectorMLA);
+  }
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, AccType, Subvectors);
+}
+
 SDValue
 HexagonTargetLowering::PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
      const {
@ -4039,43 +4301,33 @@ HexagonTargetLowering::PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
    return SDValue();

  switch (Opc) {
-    case ISD::VSELECT: {
-      // (vselect (xor x, qtrue), v0, v1) -> (vselect x, v1, v0)
-      SDValue Cond = Ops[0];
-      if (Cond->getOpcode() == ISD::XOR) {
-        SDValue C0 = Cond.getOperand(0), C1 = Cond.getOperand(1);
-        if (C1->getOpcode() == HexagonISD::QTRUE)
-          return DAG.getNode(ISD::VSELECT, dl, ty(Op), C0, Ops[2], Ops[1]);
-      }
-      break;
+  case HexagonISD::V2Q:
+    if (Ops[0].getOpcode() == ISD::SPLAT_VECTOR) {
+      if (const auto *C = dyn_cast<ConstantSDNode>(Ops[0].getOperand(0)))
+        return C->isZero() ? DAG.getNode(HexagonISD::QFALSE, dl, ty(Op))
+                           : DAG.getNode(HexagonISD::QTRUE, dl, ty(Op));
    }
-    case HexagonISD::V2Q:
-      if (Ops[0].getOpcode() == ISD::SPLAT_VECTOR) {
-        if (const auto *C = dyn_cast<ConstantSDNode>(Ops[0].getOperand(0)))
-          return C->isZero() ? DAG.getNode(HexagonISD::QFALSE, dl, ty(Op))
-                             : DAG.getNode(HexagonISD::QTRUE, dl, ty(Op));
-      }
-      break;
-    case HexagonISD::Q2V:
-      if (Ops[0].getOpcode() == HexagonISD::QTRUE)
-        return DAG.getNode(ISD::SPLAT_VECTOR, dl, ty(Op),
-                           DAG.getAllOnesConstant(dl, MVT::i32));
-      if (Ops[0].getOpcode() == HexagonISD::QFALSE)
-        return getZero(dl, ty(Op), DAG);
-      break;
-    case HexagonISD::VINSERTW0:
-      if (isUndef(Ops[1]))
-        return Ops[0];
-      break;
-    case HexagonISD::VROR: {
-      if (Ops[0].getOpcode() == HexagonISD::VROR) {
-        SDValue Vec = Ops[0].getOperand(0);
-        SDValue Rot0 = Ops[1], Rot1 = Ops[0].getOperand(1);
-        SDValue Rot = DAG.getNode(ISD::ADD, dl, ty(Rot0), {Rot0, Rot1});
-        return DAG.getNode(HexagonISD::VROR, dl, ty(Op), {Vec, Rot});
-      }
-      break;
+    break;
+  case HexagonISD::Q2V:
+    if (Ops[0].getOpcode() == HexagonISD::QTRUE)
+      return DAG.getNode(ISD::SPLAT_VECTOR, dl, ty(Op),
+                         DAG.getAllOnesConstant(dl, MVT::i32));
+    if (Ops[0].getOpcode() == HexagonISD::QFALSE)
+      return getZero(dl, ty(Op), DAG);
+    break;
+  case HexagonISD::VINSERTW0:
+    if (isUndef(Ops[1]))
+      return Ops[0];
+    break;
+  case HexagonISD::VROR: {
+    if (Ops[0].getOpcode() == HexagonISD::VROR) {
+      SDValue Vec = Ops[0].getOperand(0);
+      SDValue Rot0 = Ops[1], Rot1 = Ops[0].getOperand(1);
+      SDValue Rot = DAG.getNode(ISD::ADD, dl, ty(Rot0), {Rot0, Rot1});
+      return DAG.getNode(HexagonISD::VROR, dl, ty(Op), {Vec, Rot});
    }
+    break;
+  }
  }

  return SDValue();
--- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
@ -456,6 +456,13 @@ let Predicates = [UseHVX] in {
              (VShuff (V6_vmpyhus_acc (VDeal $Vx, -4),
                                      HVI16:$Vs, HVI16:$Vt), -4)>;
  }
+
+  def : Pat<(VecI32 (partial_reduce_umla VecI32:$Acc, HVI8:$A, HVI8:$B)),
+            (V6_vrmpyubv_acc $Acc, $A, $B)>;
+  def : Pat<(VecI32 (partial_reduce_smla VecI32:$Acc, HVI8:$A, HVI8:$B)),
+            (V6_vrmpybv_acc $Acc, $A, $B)>;
+  def : Pat<(VecI32 (partial_reduce_sumla VecI32:$Acc, HVI8:$A, HVI8:$B)),
+            (V6_vrmpybusv_acc $Acc, $B, $A)>;
 }

 let Predicates = [UseHVX] in {
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@ -327,6 +327,14 @@ InstructionCost HexagonTTIImpl::getVectorInstrCost(
  return 1;
 }

+// Request expansion for every reduction intrinsic except vector_reduce_add,
+// for which false is returned.
+bool HexagonTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
+  return II->getIntrinsicID() != Intrinsic::vector_reduce_add;
+}
+
 bool HexagonTTIImpl::isLegalMaskedStore(Type *DataType, Align /*Alignment*/,
                                        unsigned /*AddressSpace*/,
                                        TTI::MaskKind /*MaskKind*/) const {
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@ -156,7 +156,7 @@ public:
                 const Instruction *I = nullptr) const override {
    return 1;
  }
-
+  bool shouldExpandReduction(const IntrinsicInst *II) const override;
  bool isLegalMaskedStore(Type *DataType, Align Alignment,
                          unsigned AddressSpace,
                          TTI::MaskKind MaskKind) const override;
--- a/llvm/test/CodeGen/Hexagon/expand-vecreduce-add.ll
+++ b/llvm/test/CodeGen/Hexagon/expand-vecreduce-add.ll
@ -0,0 +1,143 @@
+; RUN: llc -mtriple=hexagon < %s | FileCheck %s
+
+target triple = "hexagon"
+
+define i32 @add_v32i32(<32 x i32> %vec) #0 {
+; CHECK-LABEL: add_v32i32:
+; CHECK: {
+; CHECK: [[R0:v[0-9]+]] = valign([[_:v[0-9]+]],v0,{{.+}})
+; CHECK: }
+; CHECK: {
+; CHECK: [[R1:v[0-9]+]].w = vadd(v0.w,[[R0]].w)
+; CHECK: }
+; CHECK: {
+; CHECK: [[R2:v[0-9]+]] = valign([[_:v[0-9]+]],[[R1]],{{.+}})
+; CHECK: }
+; CHECK: {
+; CHECK: [[R3:v[0-9]+]].w = vadd([[R1]].w,[[R2]].w)
+; CHECK: }
+; CHECK: {
+; CHECK: [[R4:v[0-9]+]] = valign([[_:v[0-9]+]],[[R3]],{{.+}})
+; CHECK: }
+; CHECK: {
+; CHECK: [[R5:v[0-9]+]].w = vadd([[R3]].w,[[R4]].w)
+; CHECK: }
+; CHECK: {
+; CHECK: [[R6:v[0-9]+]] = valign([[_:v[0-9]+]],[[R5]],{{.+}})
+; CHECK: }
+; CHECK: {
+; CHECK: [[R7:v[0-9]+]].w = vadd([[R5]].w,[[R6]].w)
+; CHECK: }
+; CHECK: {
+; CHECK: [[R8:v[0-9]+]] = valign([[_:v[0-9]+]],[[R7]],{{.+}})
+; CHECK: }
+; CHECK: {
+; CHECK: [[R9:v[0-9]+]].w = vadd([[R7]].w,[[R8]].w)
+; CHECK: }
+; CHECK: {
+; CHECK: r0 = vextract([[R9]],{{.+}})
+; CHECK: }
+entry:
+  %r = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %vec)
+  ret i32 %r
+}
+
+define i32 @add_v16i32(<16 x i32> %vec) #0 {
+; CHECK-LABEL: add_v16i32:
+; CHECK: {
+; CHECK: [[R0:v[0-9]+]] = valign([[_:v[0-9]+]],v0,{{.+}})
+; CHECK: }
+; CHECK: {
+; CHECK: [[R1:v[0-9]+]].w = vadd(v0.w,[[R0]].w)
+; CHECK: }
+; CHECK: {
+; CHECK: [[R2:v[0-9]+]] = valign([[_:v[0-9]+]],[[R1]],{{.+}})
+; CHECK: }
+; CHECK: {
+; CHECK: [[R3:v[0-9]+]].w = vadd([[R1]].w,[[R2]].w)
+; CHECK: }
+; CHECK: {
+; CHECK: [[R4:v[0-9]+]] = valign([[_:v[0-9]+]],[[R3]],{{.+}})
+; CHECK: }
+; CHECK: {
+; CHECK: [[R5:v[0-9]+]].w = vadd([[R3]].w,[[R4]].w)
+; CHECK: }
+; CHECK: {
+; CHECK: [[R6:v[0-9]+]] = valign([[_:v[0-9]+]],[[R5]],{{.+}})
+; CHECK: }
+; CHECK: {
+; CHECK: [[R7:v[0-9]+]].w = vadd([[R5]].w,[[R6]].w)
+; CHECK: }
+; CHECK: {
+; CHECK: r0 = vextract([[R7]],{{.+}})
+; CHECK: }
+entry:
+  %r = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %vec)
+  ret i32 %r
+}
+
+define i32 @add_v8i32(<8 x i32> %vec) #0 {
+; CHECK-LABEL: add_v8i32:
+; CHECK: {
+; CHECK: r[[RS1:[0-9]+:[0-9]+]] = vaddw(r1:0,r5:4)
+; CHECK: r[[R6:[0-9]+:[0-9]+]] = memd(r29+#0)
+; CHECK: }
+; CHECK: {
+; CHECK: r[[RS2:[0-9]+:[0-9]+]] = vaddw(r3:2,r[[R6]])
+; CHECK: }
+; CHECK: {
+; CHECK: r[[RS3:[0-9]+:[0-9]+]] = vaddw(r[[RS1]],r[[RS2]])
+; CHECK: }
+; CHECK: {
+;; TODO: combine and double register add can be optimized to single register add.
+; CHECK: r[[RS4:[0-9]+:[0-9]+]] = combine(#0,r{{[0-9]+}})
+; CHECK: }
+; CHECK: {
+; CHECK: r1:0 = vaddw(r[[RS3]],r[[RS4]])
+entry:
+  %r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %vec)
+  ret i32 %r
+}
+
+define i32 @add_v64i32(<64 x i32> %vec) #0 {
+; CHECK-LABEL: add_v64i32:
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+entry:
+  %r = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %vec)
+  ret i32 %r
+}
+
+;; Non-pow2 vectors are scalarized.
+
+define i32 @add_v12i32(<12 x i32> %vec) #0 {
+; CHECK-LABEL: add_v12i32:
+; CHECK: [[RS0:r[0-9]+]] = add(r0,r1)
+; CHECK: [[RS1:r[0-9]+]] += add([[RS0]],r{{[0-9]+}})
+; CHECK: [[RS2:r[0-9]+]] += add([[RS1]],r{{[0-9]+}})
+; CHECK: [[RS3:r[0-9]+]] += add([[RS2]],r{{[0-9]+}})
+; CHECK: [[RS4:r[0-9]+]] += add([[RS3]],r{{[0-9]+}})
+; CHECK: [[RS5:r[0-9]+]] += add([[RS4]],r{{[0-9]+}})
+entry:
+  %r = call i32 @llvm.vector.reduce.add.v12i32(<12 x i32> %vec)
+  ret i32 %r
+}
+
+define i32 @add_v3i32(<3 x i32> %vec) #0 {
+; CHECK-LABEL: add_v3i32:
+; CHECK: r{{[0-9]+}} += add(r{{[0-9]+}},r{{[0-9]+}})
+entry:
+  %r = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> %vec)
+  ret i32 %r
+}
+
+attributes #0 = { nounwind readnone "target-cpu"="hexagonv68" "target-features"="+hvx,+hvx-length128b" }
--- a/llvm/test/CodeGen/Hexagon/hvx-full-reduce.ll
+++ b/llvm/test/CodeGen/Hexagon/hvx-full-reduce.ll
@ -0,0 +1,145 @@
+; RUN: llc -mtriple=hexagon < %s | FileCheck %s
+
+;; Full add-reduction of the product of two zero-extended <128 x i8> inputs.
+;; The checks expect an unsigned accumulating vrmpy (.uw result, .ub operands)
+;; into a register zeroed by vxor of a register with itself, then five
+;; valign/vadd steps and a final vextract.
+define i32 @full_reduce_i32_128i8_uu(<128 x i8> %x, <128 x i8> %y) #0 {
+; CHECK-LABEL: full_reduce_i32_128i8_uu:
+; CHECK: [[A:v[0-9]+]] = vxor([[Z:v[0-9]+]],[[Z]])
+; CHECK: [[A]].uw += vrmpy(v0.ub,v1.ub)
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: vextract
+  %x.wide = zext <128 x i8> %x to <128 x i32>
+  %y.wide = zext <128 x i8> %y to <128 x i32>
+  %m = mul nuw nsw <128 x i32> %x.wide, %y.wide
+  %reduce = tail call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> %m)
+  ret i32 %reduce
+}
+
+;; Full add-reduction where %x is sign-extended and %y is zero-extended.
+;; The expected accumulating vrmpy lists v1 as the .ub operand and v0 as the
+;; .b operand, i.e. the operands appear in swapped order relative to the
+;; function arguments (presumably v0/v1 carry %x/%y; not verified here).
+;; NOTE(review): 'nuw' on a mul with a sign-extended operand makes the result
+;; poison whenever that operand is negative; the flag looks copied from the
+;; unsigned variant.
+define i32 @full_reduce_i32_128i8_su(<128 x i8> %x, <128 x i8> %y) #0 {
+; CHECK-LABEL: full_reduce_i32_128i8_su:
+; CHECK: [[A:v[0-9]+]] = vxor([[Z:v[0-9]+]],[[Z]])
+; CHECK: [[A]].w += vrmpy(v1.ub,v0.b)
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: vextract
+  %x.wide = sext <128 x i8> %x to <128 x i32>
+  %y.wide = zext <128 x i8> %y to <128 x i32>
+  %m = mul nuw nsw <128 x i32> %x.wide, %y.wide
+  %reduce = tail call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> %m)
+  ret i32 %reduce
+}
+
+;; Full add-reduction where %x is zero-extended and %y is sign-extended.
+;; The checks expect the mixed-sign accumulating vrmpy (v0.ub, v1.b) into a
+;; zeroed register, then five valign/vadd steps and a final vextract.
+;; NOTE(review): 'nuw' on a mul with a sign-extended operand makes the result
+;; poison whenever that operand is negative; the flag looks copied from the
+;; unsigned variant.
+define i32 @full_reduce_i32_128i8_us(<128 x i8> %x, <128 x i8> %y) #0 {
+; CHECK-LABEL: full_reduce_i32_128i8_us:
+; CHECK: [[A:v[0-9]+]] = vxor([[Z:v[0-9]+]],[[Z]])
+; CHECK: [[A]].w += vrmpy(v0.ub,v1.b)
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: vextract
+  %x.wide = zext <128 x i8> %x to <128 x i32>
+  %y.wide = sext <128 x i8> %y to <128 x i32>
+  %m = mul nuw nsw <128 x i32> %x.wide, %y.wide
+  %reduce = tail call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> %m)
+  ret i32 %reduce
+}
+
+;; Full add-reduction where both inputs are sign-extended. The checks expect
+;; the signed accumulating vrmpy (v0.b, v1.b) into a zeroed register, then five
+;; valign/vadd steps and a final vextract.
+;; NOTE(review): 'nuw' on a mul of sign-extended operands makes the result
+;; poison whenever an operand is negative; the flag looks copied from the
+;; unsigned variant.
+define i32 @full_reduce_i32_128i8_ss(<128 x i8> %x, <128 x i8> %y) #0 {
+; CHECK-LABEL: full_reduce_i32_128i8_ss:
+; CHECK: [[A:v[0-9]+]] = vxor([[Z:v[0-9]+]],[[Z]])
+; CHECK: [[A]].w += vrmpy(v0.b,v1.b)
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: vextract
+  %x.wide = sext <128 x i8> %x to <128 x i32>
+  %y.wide = sext <128 x i8> %y to <128 x i32>
+  %m = mul nuw nsw <128 x i32> %x.wide, %y.wide
+  %reduce = tail call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> %m)
+  ret i32 %reduce
+}
+
+;; Double-vector input.
+
+;; Inputs are <256 x i8>, i.e. two 128-byte HVX vectors each. The checks expect
+;; two vrmpy instructions, then a vadd followed by five valign/vadd steps.
+define i32 @full_reduce_i32_256i8(<256 x i8> %x, <256 x i8> %y) #0 {
+; CHECK-LABEL: full_reduce_i32_256i8:
+; CHECK: vrmpy
+; CHECK: vrmpy
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+  %x.wide = zext <256 x i8> %x to <256 x i32>
+  %y.wide = zext <256 x i8> %y to <256 x i32>
+  %m = mul nuw nsw <256 x i32> %x.wide, %y.wide
+  %reduce = tail call i32 @llvm.vector.reduce.add.v256i32(<256 x i32> %m)
+  ret i32 %reduce
+}
+
+;; Maximum handled vector size.
+
+;; Inputs are <1024 x i8>, i.e. eight 128-byte HVX vectors each. The checks
+;; expect eight vrmpy instructions, then a vadd followed by five valign/vadd
+;; steps.
+define i32 @full_reduce_i32_1024i8(<1024 x i8> %x, <1024 x i8> %y) #0 {
+; CHECK-LABEL: full_reduce_i32_1024i8:
+; CHECK: vrmpy
+; CHECK: vrmpy
+; CHECK: vrmpy
+; CHECK: vrmpy
+; CHECK: vrmpy
+; CHECK: vrmpy
+; CHECK: vrmpy
+; CHECK: vrmpy
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+; CHECK: valign
+; CHECK: vadd
+  %x.wide = zext <1024 x i8> %x to <1024 x i32>
+  %y.wide = zext <1024 x i8> %y to <1024 x i32>
+  %m = mul nuw nsw <1024 x i32> %x.wide, %y.wide
+  %reduce = tail call i32 @llvm.vector.reduce.add.v1024i32(<1024 x i32> %m)
+  ret i32 %reduce
+}
+
+;; #0: target CPU hexagonv68 with HVX enabled in 128-byte vector mode.
+attributes #0 = { nounwind readnone "target-cpu"="hexagonv68" "target-features"="+hvx,+hvx-length128b" }
--- a/llvm/test/CodeGen/Hexagon/hvx-partial-reduce.ll
+++ b/llvm/test/CodeGen/Hexagon/hvx-partial-reduce.ll
@ -0,0 +1,162 @@
+;; Check HVX vectorization.
+; RUN: llc -mtriple hexagon < %s | FileCheck %s --check-prefixes=CHECK,CHECK-HVX
+
+;; Check that there is no failure when compiling to scalar code; beyond that, only the presence of a scalar multiply-accumulate is checked.
+; RUN: llc -mtriple hexagon -mattr=-hvx,-hvxv73,-hvx-length128b < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NO-HVX
+
+;; 64-byte HVX (#0): partial reduction of the product of two zero-extended
+;; <64 x i8> inputs into a <16 x i32> accumulator. With HVX the checks expect
+;; a single unsigned accumulating vrmpy into v0; without HVX only a scalar
+;; mpyi accumulate is required.
+;; The intrinsic suffix matches the actual overload types (v16i32, v64i32).
+define <16 x i32> @partial_reduce_uu_64(<16 x i32> %acc, <64 x i8> %x, <64 x i8> %y) #0 {
+; CHECK-LABEL: partial_reduce_uu_64:
+; CHECK-HVX:    v0.uw += vrmpy(v1.ub,v2.ub)
+; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
+  %x.ext = zext <64 x i8> %x to <64 x i32>
+  %y.ext = zext <64 x i8> %y to <64 x i32>
+  %m = mul nuw nsw <64 x i32> %x.ext, %y.ext
+  %partial.reduce = tail call <16 x i32> @llvm.vector.partial.reduce.add.v16i32.v64i32(<16 x i32> %acc, <64 x i32> %m)
+  ret <16 x i32> %partial.reduce
+}
+
+;; 64-byte HVX (#0): %x is sign-extended, %y is zero-extended. With HVX the
+;; checks expect the mixed-sign accumulating vrmpy with v2 as the .ub operand
+;; and v1 as the .b operand.
+;; The intrinsic suffix matches the actual overload types (v16i32, v64i32).
+;; NOTE(review): 'nuw' on a mul with a sign-extended operand makes the result
+;; poison whenever that operand is negative.
+define <16 x i32> @partial_reduce_su_64(<16 x i32> %acc, <64 x i8> %x, <64 x i8> %y) #0 {
+; CHECK-LABEL: partial_reduce_su_64:
+; CHECK-HVX:    v0.w += vrmpy(v2.ub,v1.b)
+; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
+  %x.ext = sext <64 x i8> %x to <64 x i32>
+  %y.ext = zext <64 x i8> %y to <64 x i32>
+  %m = mul nuw nsw <64 x i32> %x.ext, %y.ext
+  %partial.reduce = tail call <16 x i32> @llvm.vector.partial.reduce.add.v16i32.v64i32(<16 x i32> %acc, <64 x i32> %m)
+  ret <16 x i32> %partial.reduce
+}
+
+;; 64-byte HVX (#0): %x is zero-extended, %y is sign-extended. With HVX the
+;; checks expect the mixed-sign accumulating vrmpy (v1.ub, v2.b) into v0.
+;; The intrinsic suffix matches the actual overload types (v16i32, v64i32).
+;; NOTE(review): 'nuw' on a mul with a sign-extended operand makes the result
+;; poison whenever that operand is negative.
+define <16 x i32> @partial_reduce_us_64(<16 x i32> %acc, <64 x i8> %x, <64 x i8> %y) #0 {
+; CHECK-LABEL: partial_reduce_us_64:
+; CHECK-HVX:    v0.w += vrmpy(v1.ub,v2.b)
+; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
+  %x.ext = zext <64 x i8> %x to <64 x i32>
+  %y.ext = sext <64 x i8> %y to <64 x i32>
+  %m = mul nuw nsw <64 x i32> %x.ext, %y.ext
+  %partial.reduce = tail call <16 x i32> @llvm.vector.partial.reduce.add.v16i32.v64i32(<16 x i32> %acc, <64 x i32> %m)
+  ret <16 x i32> %partial.reduce
+}
+
+;; 64-byte HVX (#0): both inputs are sign-extended. With HVX the checks expect
+;; the signed accumulating vrmpy (v1.b, v2.b) into v0.
+;; The intrinsic suffix matches the actual overload types (v16i32, v64i32).
+;; NOTE(review): 'nuw' on a mul of sign-extended operands makes the result
+;; poison whenever an operand is negative.
+define <16 x i32> @partial_reduce_ss_64(<16 x i32> %acc, <64 x i8> %x, <64 x i8> %y) #0 {
+; CHECK-LABEL: partial_reduce_ss_64:
+; CHECK-HVX:    v0.w += vrmpy(v1.b,v2.b)
+; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
+  %x.ext = sext <64 x i8> %x to <64 x i32>
+  %y.ext = sext <64 x i8> %y to <64 x i32>
+  %m = mul nuw nsw <64 x i32> %x.ext, %y.ext
+  %partial.reduce = tail call <16 x i32> @llvm.vector.partial.reduce.add.v16i32.v64i32(<16 x i32> %acc, <64 x i32> %m)
+  ret <16 x i32> %partial.reduce
+}
+
+;; 128-byte HVX (#1): partial reduction of the product of two zero-extended
+;; <128 x i8> inputs into a <32 x i32> accumulator. With HVX the checks expect
+;; a single unsigned accumulating vrmpy into v0; without HVX only a scalar
+;; mpyi accumulate is required.
+define <32 x i32> @partial_reduce_uu_128(<32 x i32> %acc, <128 x i8> %x, <128 x i8> %y) #1 {
+; CHECK-LABEL: partial_reduce_uu_128:
+; CHECK-HVX:    v0.uw += vrmpy(v1.ub,v2.ub)
+; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
+  %x.ext = zext <128 x i8> %x to <128 x i32>
+  %y.ext = zext <128 x i8> %y to <128 x i32>
+  %m = mul nuw nsw <128 x i32> %x.ext, %y.ext
+  %partial.reduce = tail call <32 x i32> @llvm.vector.partial.reduce.add.v32i32.v128i32(<32 x i32> %acc, <128 x i32> %m)
+  ret <32 x i32> %partial.reduce
+}
+
+;; 128-byte HVX (#1): %x is sign-extended, %y is zero-extended. With HVX the
+;; checks expect the mixed-sign accumulating vrmpy with v2 as the .ub operand
+;; and v1 as the .b operand.
+;; NOTE(review): 'nuw' on a mul with a sign-extended operand makes the result
+;; poison whenever that operand is negative.
+define <32 x i32> @partial_reduce_su_128(<32 x i32> %acc, <128 x i8> %x, <128 x i8> %y) #1 {
+; CHECK-LABEL: partial_reduce_su_128:
+; CHECK-HVX:    v0.w += vrmpy(v2.ub,v1.b)
+; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
+  %x.ext = sext <128 x i8> %x to <128 x i32>
+  %y.ext = zext <128 x i8> %y to <128 x i32>
+  %m = mul nuw nsw <128 x i32> %x.ext, %y.ext
+  %partial.reduce = tail call <32 x i32> @llvm.vector.partial.reduce.add.v32i32.v128i32(<32 x i32> %acc, <128 x i32> %m)
+  ret <32 x i32> %partial.reduce
+}
+
+;; 128-byte HVX (#1): %x is zero-extended, %y is sign-extended. With HVX the
+;; checks expect the mixed-sign accumulating vrmpy (v1.ub, v2.b) into v0.
+;; NOTE(review): 'nuw' on a mul with a sign-extended operand makes the result
+;; poison whenever that operand is negative.
+define <32 x i32> @partial_reduce_us_128(<32 x i32> %acc, <128 x i8> %x, <128 x i8> %y) #1 {
+; CHECK-LABEL: partial_reduce_us_128:
+; CHECK-HVX:    v0.w += vrmpy(v1.ub,v2.b)
+; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
+  %x.ext = zext <128 x i8> %x to <128 x i32>
+  %y.ext = sext <128 x i8> %y to <128 x i32>
+  %m = mul nuw nsw <128 x i32> %x.ext, %y.ext
+  %partial.reduce = tail call <32 x i32> @llvm.vector.partial.reduce.add.v32i32.v128i32(<32 x i32> %acc, <128 x i32> %m)
+  ret <32 x i32> %partial.reduce
+}
+
+;; 128-byte HVX (#1): both inputs are sign-extended. With HVX the checks expect
+;; the signed accumulating vrmpy (v1.b, v2.b) into v0.
+;; NOTE(review): 'nuw' on a mul of sign-extended operands makes the result
+;; poison whenever an operand is negative.
+define <32 x i32> @partial_reduce_ss_128(<32 x i32> %acc, <128 x i8> %x, <128 x i8> %y) #1 {
+; CHECK-LABEL: partial_reduce_ss_128:
+; CHECK-HVX:    v0.w += vrmpy(v1.b,v2.b)
+; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
+  %x.ext = sext <128 x i8> %x to <128 x i32>
+  %y.ext = sext <128 x i8> %y to <128 x i32>
+  %m = mul nuw nsw <128 x i32> %x.ext, %y.ext
+  %partial.reduce = tail call <32 x i32> @llvm.vector.partial.reduce.add.v32i32.v128i32(<32 x i32> %acc, <128 x i32> %m)
+  ret <32 x i32> %partial.reduce
+}
+
+;; Multiple-size inputs, same output size.
+;; <256 x i8> inputs (two 128-byte vectors each) reduced into a single
+;; <32 x i32> accumulator. With HVX the checks expect two unsigned accumulating
+;; vrmpy instructions whose results are combined with the incoming v0 by two
+;; vadd instructions, the last one writing v0.
+define <32 x i32> @partial_reduce_uu_32xi32_256xi8(<32 x i32> %acc, <256 x i8> %x, <256 x i8> %y) #1 {
+; CHECK-LABEL: partial_reduce_uu_32xi32_256xi8:
+; CHECK-HVX:    [[R1:v[0-9]+]].uw += vrmpy({{v[0-9]+}}.ub,{{v[0-9]+}}.ub)
+; CHECK-HVX:    [[R2:v[0-9]+]].uw += vrmpy({{v[0-9]+}}.ub,{{v[0-9]+}}.ub)
+; CHECK-HVX:    [[R3:v[0-9]+]].w = vadd(v0.w,[[R1]].w)
+; CHECK-HVX:    v0.w = vadd([[R2]].w,[[R3]].w)
+; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
+  %x.ext = zext <256 x i8> %x to <256 x i32>
+  %y.ext = zext <256 x i8> %y to <256 x i32>
+  %m = mul nuw nsw <256 x i32> %x.ext, %y.ext
+  %partial.reduce = tail call <32 x i32> @llvm.vector.partial.reduce.add.v32i32.v256i32(<32 x i32> %acc, <256 x i32> %m)
+  ret <32 x i32> %partial.reduce
+}
+
+;; <1024 x i8> inputs (eight 128-byte vectors each) reduced into a single
+;; <32 x i32> accumulator. With HVX the checks require eight vrmpy and eight
+;; vadd matches, in any relative order.
+define <32 x i32> @partial_reduce_uu_32xi32_1024xi8(<32 x i32> %acc, <1024 x i8> %x, <1024 x i8> %y) #1 {
+; CHECK-LABEL: partial_reduce_uu_32xi32_1024xi8:
+; CHECK-HVX-DAG: vrmpy
+; CHECK-HVX-DAG: vadd
+; CHECK-HVX-DAG: vrmpy
+; CHECK-HVX-DAG: vadd
+; CHECK-HVX-DAG: vrmpy
+; CHECK-HVX-DAG: vadd
+; CHECK-HVX-DAG: vrmpy
+; CHECK-HVX-DAG: vadd
+; CHECK-HVX-DAG: vrmpy
+; CHECK-HVX-DAG: vadd
+; CHECK-HVX-DAG: vrmpy
+; CHECK-HVX-DAG: vadd
+; CHECK-HVX-DAG: vrmpy
+; CHECK-HVX-DAG: vadd
+; CHECK-HVX-DAG: vrmpy
+; CHECK-HVX-DAG: vadd
+; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
+  %x.ext = zext <1024 x i8> %x to <1024 x i32>
+  %y.ext = zext <1024 x i8> %y to <1024 x i32>
+  %m = mul nuw nsw <1024 x i32> %x.ext, %y.ext
+  %partial.reduce = tail call <32 x i32> @llvm.vector.partial.reduce.add.v32i32.v1024i32(<32 x i32> %acc, <1024 x i32> %m)
+  ret <32 x i32> %partial.reduce
+}
+
+;; <1024 x i8> inputs reduced 4:1 into a <256 x i32> accumulator (eight
+;; 128-byte vectors on each side). With HVX the checks expect eight vrmpy
+;; instructions and no vadd between the last of them and dealloc_return.
+;; The intrinsic suffix matches the actual overload types (v256i32, v1024i32).
+;; NOTE(review): the function name says 64xi32, but the accumulator and result
+;; type is <256 x i32>; the name is kept so the label check stays unchanged.
+define <256 x i32> @partial_reduce_uu_64xi32_1024xi8(<256 x i32> %acc, <1024 x i8> %x, <1024 x i8> %y) #1 {
+; CHECK-LABEL: partial_reduce_uu_64xi32_1024xi8:
+; CHECK-HVX-COUNT-8: vrmpy
+; CHECK-HVX-NOT: vadd
+; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
+; CHECK-HVX: dealloc_return
+  %x.ext = zext <1024 x i8> %x to <1024 x i32>
+  %y.ext = zext <1024 x i8> %y to <1024 x i32>
+  %m = mul nuw nsw <1024 x i32> %x.ext, %y.ext
+  %partial.reduce = tail call <256 x i32> @llvm.vector.partial.reduce.add.v256i32.v1024i32(<256 x i32> %acc, <1024 x i32> %m)
+  ret <256 x i32> %partial.reduce
+}
+
+;; Check a vector size that does not match an available vrmpy (2x reduction).
+;; <128 x i8> inputs reduced into a <64 x i32> accumulator, i.e. only a 2:1
+;; reduction. With HVX the checks expect a vmpy followed by a vadd; there are
+;; no checks for the non-HVX output of this function beyond its label.
+define <64 x i32> @partial_reduce_unsupported(<64 x i32> %acc, <128 x i8> %x, <128 x i8> %y) #1 {
+; CHECK-LABEL: partial_reduce_unsupported:
+; CHECK-HVX: vmpy
+; CHECK-HVX: vadd
+  %x.ext = zext <128 x i8> %x to <128 x i32>
+  %y.ext = zext <128 x i8> %y to <128 x i32>
+  %m = mul nuw nsw <128 x i32> %x.ext, %y.ext
+  %partial.reduce = tail call <64 x i32> @llvm.vector.partial.reduce.add.v64i32.v128i32(<64 x i32> %acc, <128 x i32> %m)
+  ret <64 x i32> %partial.reduce
+}
+
+;; Both groups target hexagonv73 with hvxv73; #0 selects 64-byte HVX vectors
+;; and #1 selects 128-byte HVX vectors.
+attributes #0 = { nounwind "target-cpu"="hexagonv73" "target-features"="+hvxv73,+hvx-length64b" }
+attributes #1 = { nounwind "target-cpu"="hexagonv73" "target-features"="+hvxv73,+hvx-length128b" }