[Hexagon] Support partial reduction intrinsics (#179797)

This commit has changes necessary for using vrmpy instructions in full and partial multiply/add reductions on extended arguments. There are three main parts:
- partial reduction operations PARTIAL_REDUCE_(U|S|SU)MLA are lowered to accumulating vrmpy, including native and multiples of native vector sizes;
- full and partial reductions can be "split" into an inner partial reduction and a residual full or partial reduction. The inner reduction will be lowered to vrmpy due to the first change;
- vecreduce_add expansion is moved to Hexagon backend from a generic pass, accompanied by a set of tests.

In addition, there is a minor cleanup in HexagonTargetLowering::PerformDAGCombine().
This commit is contained in:
Alexey Karyakin 2026-02-17 14:56:05 -05:00 committed by GitHub
parent 42618de278
commit 98d8b69dfc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 884 additions and 60 deletions

View File

@ -1506,6 +1506,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
MaxStoresPerMemset = 8;
MaxStoresPerMemsetOptSize = 4;
setTargetDAGCombine(ISD::VECREDUCE_ADD);
//
// Set up register classes.
//
@ -3413,16 +3415,50 @@ HexagonTargetLowering::ReplaceNodeResults(SDNode *N,
SDValue
HexagonTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SDValue Op(N, 0);
const SDLoc &dl(Op);
unsigned Opc = Op.getOpcode();
// Combining transformations applicable for arbitrary vector sizes.
if (DCI.isBeforeLegalizeOps()) {
switch (Opc) {
case ISD::VECREDUCE_ADD:
if (SDValue V = splitVecReduceAdd(N, DCI.DAG))
return V;
if (SDValue V = expandVecReduceAdd(N, DCI.DAG))
return V;
return SDValue();
case ISD::PARTIAL_REDUCE_SMLA:
case ISD::PARTIAL_REDUCE_UMLA:
case ISD::PARTIAL_REDUCE_SUMLA:
if (SDValue V = splitExtendingPartialReduceMLA(N, DCI.DAG))
return V;
return SDValue();
}
} else {
switch (Opc) {
case ISD::VSELECT: {
// (vselect (xor x, ptrue), v0, v1) -> (vselect x, v1, v0)
SDValue Cond = Op.getOperand(0);
if (Cond->getOpcode() == ISD::XOR) {
SDValue C0 = Cond.getOperand(0), C1 = Cond.getOperand(1);
if (C1->getOpcode() == HexagonISD::PTRUE) {
SDValue VSel = DCI.DAG.getNode(ISD::VSELECT, dl, ty(Op), C0,
Op.getOperand(2), Op.getOperand(1));
return VSel;
}
}
return SDValue();
}
}
}
if (isHvxOperation(N, DCI.DAG)) {
if (SDValue V = PerformHvxDAGCombine(N, DCI))
return V;
return SDValue();
}
SDValue Op(N, 0);
const SDLoc &dl(Op);
unsigned Opc = Op.getOpcode();
if (Opc == ISD::TRUNCATE) {
SDValue Op0 = Op.getOperand(0);
// fold (truncate (build pair x, y)) -> (truncate x) or x
@ -3441,7 +3477,8 @@ HexagonTargetLowering::PerformDAGCombine(SDNode *N,
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (Opc == HexagonISD::P2D) {
switch (Opc) {
case HexagonISD::P2D: {
SDValue P = Op.getOperand(0);
switch (P.getOpcode()) {
case HexagonISD::PTRUE:
@ -3451,20 +3488,9 @@ HexagonTargetLowering::PerformDAGCombine(SDNode *N,
default:
break;
}
} else if (Opc == ISD::VSELECT) {
// This is pretty much duplicated in HexagonISelLoweringHVX...
//
// (vselect (xor x, ptrue), v0, v1) -> (vselect x, v1, v0)
SDValue Cond = Op.getOperand(0);
if (Cond->getOpcode() == ISD::XOR) {
SDValue C0 = Cond.getOperand(0), C1 = Cond.getOperand(1);
if (C1->getOpcode() == HexagonISD::PTRUE) {
SDValue VSel = DCI.DAG.getNode(ISD::VSELECT, dl, ty(Op), C0,
Op.getOperand(2), Op.getOperand(1));
return VSel;
}
}
} else if (Opc == ISD::TRUNCATE) {
break;
}
case ISD::TRUNCATE: {
SDValue Op0 = Op.getOperand(0);
// fold (truncate (build pair x, y)) -> (truncate x) or x
if (Op0.getOpcode() == ISD::BUILD_PAIR) {
@ -3477,7 +3503,9 @@ HexagonTargetLowering::PerformDAGCombine(SDNode *N,
if (ty(Elem0).bitsGT(TruncTy))
return DCI.DAG.getNode(ISD::TRUNCATE, dl, TruncTy, Elem0);
}
} else if (Opc == ISD::OR) {
break;
}
case ISD::OR: {
// fold (or (shl xx, s), (zext y)) -> (COMBINE (shl xx, s-32), y)
// if s >= 32
auto fold0 = [&, this](SDValue Op) {
@ -3507,6 +3535,8 @@ HexagonTargetLowering::PerformDAGCombine(SDNode *N,
if (SDValue R = fold0(Op))
return R;
break;
}
}
return SDValue();
@ -3750,6 +3780,78 @@ EVT HexagonTargetLowering::getOptimalMemOpType(
return MVT::Other;
}
// The helpers below are versions of llvm::getShuffleReduction and
// llvm::getOrderedReduction, adapted to use during DAG passes and simplified as
// follows:
// - ICmp and FCmp are not handled;
// - in every step in getShuffleReduction, the input is split into halves (not
// pairwise).
/// Reduce \p Vec to a scalar by folding its lanes one at a time, in increasing
/// lane order, with the binary node \p Op. expandVecReduceAdd uses this for
/// vectors whose length is not a power of two.
///
/// \param Vec  Vector to reduce; must have at least one element.
/// \param Op   Opcode handed to SelectionDAG::getNode for every fold step
///             (ISD::ADD for the caller in this file).
/// \param DAG  DAG in which the extract and fold nodes are created.
/// \returns    A scalar of \p Vec's element type.
///
/// NOTE(review): \p Op is used as an ISD opcode below, yet the first assertion
/// compares it with IR-level Instruction opcodes. Confirm whether a check
/// against ISD::SETCC was intended.
static SDValue getOrderedReduction(SDValue Vec, unsigned Op,
                                   SelectionDAG &DAG) {
  assert(Op != Instruction::ICmp && Op != Instruction::FCmp);

  const EVT VecTy = Vec.getValueType();
  const EVT LaneTy = VecTy.getVectorElementType();
  const unsigned NumLanes = VecTy.getVectorNumElements();
  assert(NumLanes > 0 &&
         "Reduction emission only supported for non-zero length vectors!");

  SDLoc DL(Vec);
  // Seed the accumulator with lane 0, then fold in lanes 1 .. NumLanes-1.
  SDValue Acc = DAG.getExtractVectorElt(DL, LaneTy, Vec, 0);
  unsigned Lane = 1;
  while (Lane < NumLanes) {
    SDValue Elt = DAG.getExtractVectorElt(DL, LaneTy, Vec, Lane);
    Acc = DAG.getNode(Op, DL, LaneTy, {Acc, Elt});
    ++Lane;
  }
  return Acc;
}
/// Reduce \p Vec to a scalar with a halving ladder: in every round the upper
/// half of the still-live lanes is shuffled down onto the lower half and the
/// two are combined with \p Op, so log2(VF) rounds leave the result in lane 0.
///
/// \param Vec  Vector to reduce; its length must be a non-zero power of two.
/// \param Op   Opcode handed to SelectionDAG::getNode for every round
///             (ISD::ADD for the caller in this file).
/// \param DAG  DAG in which the shuffle, fold and extract nodes are created.
/// \returns    A scalar of \p Vec's element type.
///
/// NOTE(review): \p Op is used as an ISD opcode below, yet the first assertion
/// compares it with IR-level Instruction opcodes. Confirm whether a check
/// against ISD::SETCC was intended.
static SDValue getShuffleReduction(SDValue Vec, unsigned Op,
                                   SelectionDAG &DAG) {
  assert(Op != Instruction::ICmp && Op != Instruction::FCmp);
  EVT VT = Vec.getValueType();
  unsigned VF = VT.getVectorNumElements();
  // Preconditions are checked with assertions, matching getOrderedReduction.
  assert(VF > 0 && "Vector must be non-zero length");
  // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
  // and vector ops, reducing the set of values being computed by half each
  // round.
  assert(isPowerOf2_32(VF) &&
         "Reduction emission only supported for pow2 vectors!");
  SDLoc DL(Vec);
  // TODO: Is it correct to create double-vector shuffle and fill 3/4 of it with
  // undefs?
  SmallVector<int, 32> ShuffleMask(VF);
  for (unsigned i = VF; i > 1; i >>= 1) {
    const unsigned Half = i / 2;
    // Move the upper half of the vector to the lower half.
    for (unsigned j = 0; j != Half; ++j)
      ShuffleMask[j] = Half + j;
    // Fill the rest of the mask with undef.
    std::fill(ShuffleMask.begin() + Half, ShuffleMask.end(), -1);
    SDValue Shuf =
        DAG.getVectorShuffle(VT, DL, Vec, DAG.getUNDEF(VT), ShuffleMask);
    Vec = DAG.getNode(Op, DL, VT, {Vec, Shuf});
  }
  // The result is in the first element of the vector.
  return DAG.getExtractVectorElt(DL, VT.getVectorElementType(), Vec, 0);
}
/// Expand a VECREDUCE_ADD node \p N into explicit DAG nodes. Automatic
/// reduction expansion is disabled for this intrinsic, so the expansion is
/// produced here instead.
///
/// \returns the scalar sum of the elements of N's vector operand.
SDValue HexagonTargetLowering::expandVecReduceAdd(SDNode *N,
                                                  SelectionDAG &DAG) const {
  SDValue Src = N->getOperand(0);
  const unsigned NumElts = Src.getValueType().getVectorNumElements();
  // Vectors whose length is not a power of two are scalarized.
  if (!isPowerOf2_32(NumElts))
    return getOrderedReduction(Src, ISD::ADD, DAG);
  // Power-of-two lengths get the log2 shuffle/add ladder.
  return getShuffleReduction(Src, ISD::ADD, DAG);
}
bool HexagonTargetLowering::allowsMemoryAccess(
LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace,
Align Alignment, MachineMemOperand::Flags Flags, unsigned *Fast) const {

View File

@ -485,6 +485,7 @@ private:
SDValue LowerHvxIntToFp(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxPred32ToFp(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxPred64ToFp(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxPartialReduceMLA(SDValue Op, SelectionDAG &DAG) const;
SDValue ExpandHvxFpToInt(SDValue Op, SelectionDAG &DAG) const;
SDValue ExpandHvxIntToFp(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxStore(SDValue Op, SelectionDAG &DAG) const;
@ -519,10 +520,14 @@ private:
SDValue combineTruncateBeforeLegal(SDValue Op, DAGCombinerInfo &DCI) const;
SDValue combineConcatVectorsBeforeLegal(SDValue Op, DAGCombinerInfo & DCI)
const;
SDValue combineVectorShuffleBeforeLegal(SDValue Op, DAGCombinerInfo & DCI)
const;
SDValue PerformHvxDAGCombine(SDNode * N, DAGCombinerInfo & DCI) const;
SDValue expandVecReduceAdd(SDNode *N, SelectionDAG &DAG) const;
SDValue createExtendingPartialReduceMLA(
unsigned Opcode, EVT AccEltType, unsigned AccNumElements, EVT InputType,
const SDValue &A, const SDValue &B, unsigned &RemainingReductionRatio,
const SDLoc &DL, SelectionDAG &DAG) const;
SDValue splitVecReduceAdd(SDNode *N, SelectionDAG &DAG) const;
SDValue splitExtendingPartialReduceMLA(SDNode *N, SelectionDAG &DAG) const;
SDValue PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
};
} // end namespace llvm

View File

@ -40,6 +40,8 @@ static const MVT LegalW64[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 };
static const MVT LegalV128[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 };
static const MVT LegalW128[] = { MVT::v256i8, MVT::v128i16, MVT::v64i32 };
static const unsigned MaxExpandMLA = 8;
static std::tuple<unsigned, unsigned, unsigned> getIEEEProperties(MVT Ty) {
// For a float scalar type, return (exp-bits, exp-bias, fraction-bits)
MVT ElemTy = Ty.getScalarType();
@ -504,6 +506,69 @@ HexagonTargetLowering::initializeHVXLowering() {
setOperationAction(ISD::SINT_TO_FP, MVT::v32i1, Custom);
setTargetDAGCombine({ISD::CONCAT_VECTORS, ISD::TRUNCATE, ISD::VSELECT});
setTargetDAGCombine({ISD::PARTIAL_REDUCE_SMLA, ISD::PARTIAL_REDUCE_UMLA,
ISD::PARTIAL_REDUCE_SUMLA});
// Partial MLA reductions.
{
static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
ISD::PARTIAL_REDUCE_UMLA,
ISD::PARTIAL_REDUCE_SUMLA};
auto HvxType = [=](MVT ScalarT, unsigned Factor = 1) {
return MVT::getVectorVT(ScalarT, Subtarget.getVectorLength() * Factor *
8 / ScalarT.getSizeInBits());
};
// Tuple of (Acc element type, input element type, vector pair).
// The assumption is both the input and reduction result are of the same
// size so the reduction ratio is the same as the ratio of element type
// sizes. This may not hold for all available instructions.
typedef std::tuple<MVT, MVT, bool> ReductionSignature;
static const std::vector<ReductionSignature> NativeReductions = {
{MVT::i32, MVT::i8, false},
};
for (const auto &R : NativeReductions) {
MVT AccType = std::get<0>(R);
MVT InputType = std::get<1>(R);
unsigned Factor = std::get<2>(R) ? 2 : 1;
// The native size is legal.
setPartialReduceMLAAction(MLAOps, HvxType(AccType), HvxType(InputType),
Legal);
// Allow custom partial MLA reductions on larger vectors than legally
// supported. These reductions must be declared as Custom (or Legal)
// for foldPartialReduceMLAMulOp() to fold the multiply by one pattern
// inserted when the partial reduction intrinsic is converted to
// PARTIAL_REDUCE_U/S/SUMLA. Otherwise, the Split action will apply
// on the original pattern, including the extensions and multiplies,
// which will make it impossible to match.
// There are two independent ways to extend the
// input size: 1) to concatenate the result - the output vector is
// proportionally extended, 2) to reduce the result - the output vector
// size stays the same. We limit allowed combinations so that the total
// number of generated reduction instructions is limited by a constant
// number. This limit is arbitrary and can be revised. On one hand, it is
// convenient to have more choices; on the other hand, there is a
// diminishing benefit of very long sequences, which should probably be
// written as loops instead.
for (unsigned ConcatFactor = 1; ConcatFactor <= MaxExpandMLA;
ConcatFactor <<= 1)
for (unsigned ReductionFactor = 1; ReductionFactor <= MaxExpandMLA;
ReductionFactor <<= 1)
if (ConcatFactor * ReductionFactor != 1 &&
ConcatFactor * ReductionFactor <= MaxExpandMLA)
setPartialReduceMLAAction(
MLAOps, HvxType(AccType, Factor * ConcatFactor),
HvxType(InputType, Factor * ConcatFactor * ReductionFactor),
Custom);
}
}
}
unsigned
@ -3678,6 +3743,11 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
case HexagonISD::SMUL_LOHI:
case HexagonISD::UMUL_LOHI:
case HexagonISD::USMUL_LOHI: return LowerHvxMulLoHi(Op, DAG);
case ISD::PARTIAL_REDUCE_SMLA:
case ISD::PARTIAL_REDUCE_UMLA:
case ISD::PARTIAL_REDUCE_SUMLA:
return LowerHvxPartialReduceMLA(Op, DAG);
// clang-format on
}
#ifndef NDEBUG
@ -4020,6 +4090,198 @@ HexagonTargetLowering::combineConcatVectorsBeforeLegal(
return DAG.getVectorShuffle(LongTy, dl, Cat, DAG.getUNDEF(LongTy), LongMask);
}
// Build the inner partial-reduction MLA that can be lowered efficiently. Both
// the partial-reduction split and the full-reduction split use this helper.
//
// Opcode                  - PARTIAL_REDUCE_*MLA opcode for the new node.
// AccEltType              - element type of the final accumulator.
// AccNumElements          - element count of the final accumulator (1 for a
//                           full reduction).
// InputType               - vector type of the not-yet-extended inputs.
// A, B                    - the multiplicands.
// RemainingReductionRatio - out: the factor still to be reduced after the
//                           inner node. Written only once the input length is
//                           known to be a multiple of AccNumElements times the
//                           native ratio; it is also written (as 1) when the
//                           function then declines.
//
// Returns an empty SDValue when HVX is off, the element types are not
// i8 -> i32, the input length does not divide evenly, or no further reduction
// would remain after the native step.
SDValue HexagonTargetLowering::createExtendingPartialReduceMLA(
    unsigned Opcode, EVT AccEltType, unsigned AccNumElements, EVT InputType,
    const SDValue &A, const SDValue &B, unsigned &RemainingReductionRatio,
    const SDLoc &DL, SelectionDAG &DAG) const {
  if (!DAG.getSubtarget<HexagonSubtarget>().useHVXOps())
    return SDValue();

  // Only an i8 -> i32 sub-reduction (a factor of four) is recognised here.
  const EVT InputEltType = InputType.getVectorElementType();
  if (!(AccEltType == MVT::i32 && InputEltType == MVT::i8))
    return SDValue();
  const unsigned NativeRatio = 4;

  // A split is only worthwhile when more reduction remains afterwards, i.e.
  // the input is longer than the result by more than the native factor.
  const unsigned NativeInputLen = AccNumElements * NativeRatio;
  ElementCount InputEC = InputType.getVectorElementCount();
  if (!InputEC.isKnownMultipleOf(NativeInputLen))
    return SDValue();

  const unsigned InputNumElements = InputEC.getFixedValue();
  RemainingReductionRatio = InputNumElements / NativeInputLen;
  if (RemainingReductionRatio == 1)
    return SDValue();

  // Reduce by the native factor into a zero-initialised intermediate vector.
  EVT IntermediateType = EVT::getVectorVT(*DAG.getContext(), AccEltType,
                                          InputNumElements / NativeRatio);
  return DAG.getNode(Opcode, DL, IntermediateType,
                     DAG.getConstant(0, DL, IntermediateType), A, B);
}
static bool DetectExtendingMultiply(const SDValue &N, EVT ScalarType,
unsigned &Opcode, SDValue &A, SDValue &B) {
SDValue Mul = N;
EVT AccType = Mul.getValueType(); // Vector input type after extension.
if (ScalarType != AccType.getVectorElementType())
return false;
bool swap = false;
if (Mul->getOpcode() != ISD::MUL)
return false;
A = Mul->getOperand(0);
B = Mul->getOperand(1);
if (A.getOpcode() == ISD::ZERO_EXTEND) {
if (B.getOpcode() == ISD::ZERO_EXTEND)
Opcode = ISD::PARTIAL_REDUCE_UMLA;
else if (B.getOpcode() == ISD::SIGN_EXTEND) {
swap = true;
Opcode = ISD::PARTIAL_REDUCE_SUMLA;
} else
return false;
} else if (A.getOpcode() == ISD::SIGN_EXTEND) {
if (B.getOpcode() == ISD::ZERO_EXTEND)
Opcode = ISD::PARTIAL_REDUCE_SUMLA;
else if (B.getOpcode() == ISD::SIGN_EXTEND)
Opcode = ISD::PARTIAL_REDUCE_SMLA;
else
return false;
} else
return false;
// Get multiplication arguments before extension.
A = A->getOperand(0);
B = B->getOperand(0);
if (A.getValueType() != B.getValueType())
return false;
if (swap)
std::swap(A, B);
return true;
}
/// Rewrite vecreduce_add(mul(ext(a), ext(b))) as a vecreduce_add over an
/// extending partial-reduction MLA of a and b.
///
/// \returns the replacement VECREDUCE_ADD node, or an empty SDValue when HVX
/// is off, the operand is not an extending multiply, or no suitable inner
/// reduction can be created.
SDValue HexagonTargetLowering::splitVecReduceAdd(SDNode *N,
                                                 SelectionDAG &DAG) const {
  if (!Subtarget.useHVXOps())
    return SDValue();

  const EVT ResultTy = N->getValueType(0);
  unsigned MLAOpcode;
  SDValue LHS, RHS;
  const bool IsExtendingMul =
      DetectExtendingMultiply(N->getOperand(0), ResultTy, MLAOpcode, LHS, RHS);
  if (!IsExtendingMul)
    return SDValue();

  SDLoc DL(N);
  // A full reduction is a partial reduction to a single accumulator element.
  unsigned ResidualRatio;
  SDValue Inner = createExtendingPartialReduceMLA(
      MLAOpcode, ResultTy, 1, LHS.getValueType(), LHS, RHS, ResidualRatio, DL,
      DAG);

  // We could have inserted a trivial MLA and rely on the folding action,
  // similar to how vector_partial_reduce_add is lowered to an MLA in
  // SelectionDAGBuilder. However, we just replace the final result since we
  // have analyzed the input completely.
  if (Inner)
    return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultTy, Inner);
  return SDValue();
}
// When possible, separate an MLA reduction with extended operands but
// unsupported reduction factor into an extending partial reduction that
// can be efficiently lowered, and a follow-up partial reduction.
// partial_reduce_mla(a, x, y) ->
// partial_reduce_mla(a, partial_reduce_mla(0, x, y), 1)
//
// N is a PARTIAL_REDUCE_[U|S|SU]MLA node with operands (Acc, A, B). Returns
// the replacement node, or an empty SDValue when HVX is off, the operand
// types differ, the (Acc, input) type pair is not marked Custom, or no
// suitable inner reduction can be created.
SDValue
HexagonTargetLowering::splitExtendingPartialReduceMLA(SDNode *N,
                                                      SelectionDAG &DAG) const {
  if (!Subtarget.useHVXOps())
    return SDValue();

  SDValue Acc = N->getOperand(0);
  SDValue A = N->getOperand(1);
  SDValue B = N->getOperand(2);
  if (A.getValueType() != B.getValueType())
    return SDValue();

  // The types should be declared as custom, but do not split already legal
  // operation.
  EVT AccType = Acc.getValueType();
  EVT InputType = A.getValueType();
  if (getPartialReduceMLAAction(N->getOpcode(), AccType, InputType) != Custom)
    return SDValue();

  SDLoc DL(N);
  unsigned RemainingReductionRatio;
  SDValue Partial = createExtendingPartialReduceMLA(
      N->getOpcode(), AccType.getVectorElementType(),
      AccType.getVectorNumElements(), InputType, A, B, RemainingReductionRatio,
      DL, DAG);
  if (!Partial)
    return SDValue();
  assert(RemainingReductionRatio <= MaxExpandMLA);

  // Create the reduction for the remaining ratio. The inner node already
  // produces accumulator-width elements, so the residual step multiplies two
  // like-typed operands (the partial sums and a splat of 1). It therefore
  // only has to keep the signedness of the original node: unsigned for UMLA,
  // signed for both SMLA and SUMLA. A mixed-sign opcode is not needed here.
  EVT IntermediateType = Partial->getOperand(0).getValueType();
  SDValue One = DAG.getConstant(1, DL, IntermediateType);
  const unsigned ResidualOpcode = N->getOpcode() == ISD::PARTIAL_REDUCE_UMLA
                                      ? ISD::PARTIAL_REDUCE_UMLA
                                      : ISD::PARTIAL_REDUCE_SMLA;
  return DAG.getNode(ResidualOpcode, DL, AccType, Acc, Partial, One);
}
/// Custom-lower a PARTIAL_REDUCE_*MLA whose operands span several HVX vectors
/// by slicing the accumulator and both inputs into single-vector pieces,
/// emitting one MLA of the same opcode per piece, and concatenating the
/// per-piece results back into the accumulator type.
///
/// NOTE(review): accumulator slices are indexed with the same counter as the
/// input slices, which presumes the accumulator is as wide as the input.
/// Confirm that wider-input cases never reach this function.
SDValue
HexagonTargetLowering::LowerHvxPartialReduceMLA(SDValue Op,
                                                SelectionDAG &DAG) const {
  const SDLoc &DL(Op);
  const unsigned MLAOpcode = Op.getOpcode();
  SDValue Acc = Op.getOperand(0);
  SDValue A = Op.getOperand(1);
  SDValue B = Op.getOperand(2);

  // Split the input vectors into units of one HVX vector length.
  const unsigned HwBits = Subtarget.getVectorLength() * 8;

  // Accumulator slice: one hardware vector of accumulator elements.
  EVT AccVT = Acc.getValueType();
  EVT AccEltVT = AccVT.getVectorElementType();
  unsigned AccSliceElts = HwBits / AccEltVT.getSizeInBits();
  EVT AccSliceVT = EVT::getVectorVT(*DAG.getContext(), AccEltVT, AccSliceElts);

  // Input slice: one hardware vector of input elements.
  EVT InVT = A.getValueType();
  assert(InVT.getSizeInBits() % HwBits == 0);
  EVT InEltVT = InVT.getVectorElementType();
  unsigned InSliceElts = HwBits / InEltVT.getSizeInBits();
  EVT InSliceVT = EVT::getVectorVT(*DAG.getContext(), InEltVT, InSliceElts);

  const unsigned NumSlices = InVT.getFixedSizeInBits() / HwBits;
  SmallVector<SDValue, MaxExpandMLA> Pieces;
  for (unsigned Slice = 0, AccIdx = 0, InIdx = 0; Slice != NumSlices;
       ++Slice, AccIdx += AccSliceElts, InIdx += InSliceElts) {
    SDValue AccPart = DAG.getExtractSubvector(DL, AccSliceVT, Acc, AccIdx);
    SDValue APart = DAG.getExtractSubvector(DL, InSliceVT, A, InIdx);
    SDValue BPart = DAG.getExtractSubvector(DL, InSliceVT, B, InIdx);
    Pieces.push_back(
        DAG.getNode(MLAOpcode, DL, AccSliceVT, AccPart, APart, BPart));
  }
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, AccVT, Pieces);
}
SDValue
HexagonTargetLowering::PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
const {
@ -4039,43 +4301,33 @@ HexagonTargetLowering::PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
return SDValue();
switch (Opc) {
case ISD::VSELECT: {
// (vselect (xor x, qtrue), v0, v1) -> (vselect x, v1, v0)
SDValue Cond = Ops[0];
if (Cond->getOpcode() == ISD::XOR) {
SDValue C0 = Cond.getOperand(0), C1 = Cond.getOperand(1);
if (C1->getOpcode() == HexagonISD::QTRUE)
return DAG.getNode(ISD::VSELECT, dl, ty(Op), C0, Ops[2], Ops[1]);
}
break;
case HexagonISD::V2Q:
if (Ops[0].getOpcode() == ISD::SPLAT_VECTOR) {
if (const auto *C = dyn_cast<ConstantSDNode>(Ops[0].getOperand(0)))
return C->isZero() ? DAG.getNode(HexagonISD::QFALSE, dl, ty(Op))
: DAG.getNode(HexagonISD::QTRUE, dl, ty(Op));
}
case HexagonISD::V2Q:
if (Ops[0].getOpcode() == ISD::SPLAT_VECTOR) {
if (const auto *C = dyn_cast<ConstantSDNode>(Ops[0].getOperand(0)))
return C->isZero() ? DAG.getNode(HexagonISD::QFALSE, dl, ty(Op))
: DAG.getNode(HexagonISD::QTRUE, dl, ty(Op));
}
break;
case HexagonISD::Q2V:
if (Ops[0].getOpcode() == HexagonISD::QTRUE)
return DAG.getNode(ISD::SPLAT_VECTOR, dl, ty(Op),
DAG.getAllOnesConstant(dl, MVT::i32));
if (Ops[0].getOpcode() == HexagonISD::QFALSE)
return getZero(dl, ty(Op), DAG);
break;
case HexagonISD::VINSERTW0:
if (isUndef(Ops[1]))
return Ops[0];
break;
case HexagonISD::VROR: {
if (Ops[0].getOpcode() == HexagonISD::VROR) {
SDValue Vec = Ops[0].getOperand(0);
SDValue Rot0 = Ops[1], Rot1 = Ops[0].getOperand(1);
SDValue Rot = DAG.getNode(ISD::ADD, dl, ty(Rot0), {Rot0, Rot1});
return DAG.getNode(HexagonISD::VROR, dl, ty(Op), {Vec, Rot});
}
break;
break;
case HexagonISD::Q2V:
if (Ops[0].getOpcode() == HexagonISD::QTRUE)
return DAG.getNode(ISD::SPLAT_VECTOR, dl, ty(Op),
DAG.getAllOnesConstant(dl, MVT::i32));
if (Ops[0].getOpcode() == HexagonISD::QFALSE)
return getZero(dl, ty(Op), DAG);
break;
case HexagonISD::VINSERTW0:
if (isUndef(Ops[1]))
return Ops[0];
break;
case HexagonISD::VROR: {
if (Ops[0].getOpcode() == HexagonISD::VROR) {
SDValue Vec = Ops[0].getOperand(0);
SDValue Rot0 = Ops[1], Rot1 = Ops[0].getOperand(1);
SDValue Rot = DAG.getNode(ISD::ADD, dl, ty(Rot0), {Rot0, Rot1});
return DAG.getNode(HexagonISD::VROR, dl, ty(Op), {Vec, Rot});
}
break;
}
}
return SDValue();

View File

@ -456,6 +456,13 @@ let Predicates = [UseHVX] in {
(VShuff (V6_vmpyhus_acc (VDeal $Vx, -4),
HVI16:$Vs, HVI16:$Vt), -4)>;
}
// Select the partial multiply-accumulate reductions to the accumulating
// V6_vrmpy*_acc instructions: unsigned x unsigned, signed x signed, and
// mixed sign.
def : Pat<(VecI32 (partial_reduce_umla VecI32:$Acc, HVI8:$A, HVI8:$B)),
(V6_vrmpyubv_acc $Acc, $A, $B)>;
def : Pat<(VecI32 (partial_reduce_smla VecI32:$Acc, HVI8:$A, HVI8:$B)),
(V6_vrmpybv_acc $Acc, $A, $B)>;
// Mixed sign: the multiplicands are handed to the instruction in swapped
// order ($B before $A). The accompanying lit tests print this instruction as
// vrmpy(<reg>.ub,<reg>.b), i.e. the zero-extended input comes first.
def : Pat<(VecI32 (partial_reduce_sumla VecI32:$Acc, HVI8:$A, HVI8:$B)),
(V6_vrmpybusv_acc $Acc, $B, $A)>;
}
let Predicates = [UseHVX] in {

View File

@ -327,6 +327,14 @@ InstructionCost HexagonTTIImpl::getVectorInstrCost(
return 1;
}
/// Tell the caller whether the reduction intrinsic \p II should be expanded.
/// vector_reduce_add is the only exception: it is left intact because the
/// Hexagon backend expands VECREDUCE_ADD itself during DAG combining.
///
/// \returns false for llvm.vector.reduce.add, true for every other intrinsic.
bool HexagonTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
  const bool IsReduceAdd =
      II->getIntrinsicID() == Intrinsic::vector_reduce_add;
  return !IsReduceAdd;
}
bool HexagonTTIImpl::isLegalMaskedStore(Type *DataType, Align /*Alignment*/,
unsigned /*AddressSpace*/,
TTI::MaskKind /*MaskKind*/) const {

View File

@ -156,7 +156,7 @@ public:
const Instruction *I = nullptr) const override {
return 1;
}
bool shouldExpandReduction(const IntrinsicInst *II) const override;
bool isLegalMaskedStore(Type *DataType, Align Alignment,
unsigned AddressSpace,
TTI::MaskKind MaskKind) const override;

View File

@ -0,0 +1,143 @@
; RUN: llc -mtriple=hexagon < %s | FileCheck %s
target triple = "hexagon"
;; Add-reduction of <32 x i32>. The expected output is a ladder of five
;; valign/vadd packet pairs, each consuming the previous vadd result, followed
;; by a vextract of the final vector into r0.
define i32 @add_v32i32(<32 x i32> %vec) #0 {
; CHECK-LABEL: add_v32i32:
; CHECK: {
; CHECK: [[R0:v[0-9]+]] = valign([[_:v[0-9]+]],v0,{{.+}})
; CHECK: }
; CHECK: {
; CHECK: [[R1:v[0-9]+]].w = vadd(v0.w,[[R0]].w)
; CHECK: }
; CHECK: {
; CHECK: [[R2:v[0-9]+]] = valign([[_:v[0-9]+]],[[R1]],{{.+}})
; CHECK: }
; CHECK: {
; CHECK: [[R3:v[0-9]+]].w = vadd([[R1]].w,[[R2]].w)
; CHECK: }
; CHECK: {
; CHECK: [[R4:v[0-9]+]] = valign([[_:v[0-9]+]],[[R3]],{{.+}})
; CHECK: }
; CHECK: {
; CHECK: [[R5:v[0-9]+]].w = vadd([[R3]].w,[[R4]].w)
; CHECK: }
; CHECK: {
; CHECK: [[R6:v[0-9]+]] = valign([[_:v[0-9]+]],[[R5]],{{.+}})
; CHECK: }
; CHECK: {
; CHECK: [[R7:v[0-9]+]].w = vadd([[R5]].w,[[R6]].w)
; CHECK: }
; CHECK: {
; CHECK: [[R8:v[0-9]+]] = valign([[_:v[0-9]+]],[[R7]],{{.+}})
; CHECK: }
; CHECK: {
; CHECK: [[R9:v[0-9]+]].w = vadd([[R7]].w,[[R8]].w)
; CHECK: }
; CHECK: {
; CHECK: r0 = vextract([[R9]],{{.+}})
; CHECK: }
entry:
%r = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %vec)
ret i32 %r
}
;; Add-reduction of <16 x i32>. Same ladder shape as the 32-element case, but
;; with four valign/vadd packet pairs before the final vextract.
define i32 @add_v16i32(<16 x i32> %vec) #0 {
; CHECK-LABEL: add_v16i32:
; CHECK: {
; CHECK: [[R0:v[0-9]+]] = valign([[_:v[0-9]+]],v0,{{.+}})
; CHECK: }
; CHECK: {
; CHECK: [[R1:v[0-9]+]].w = vadd(v0.w,[[R0]].w)
; CHECK: }
; CHECK: {
; CHECK: [[R2:v[0-9]+]] = valign([[_:v[0-9]+]],[[R1]],{{.+}})
; CHECK: }
; CHECK: {
; CHECK: [[R3:v[0-9]+]].w = vadd([[R1]].w,[[R2]].w)
; CHECK: }
; CHECK: {
; CHECK: [[R4:v[0-9]+]] = valign([[_:v[0-9]+]],[[R3]],{{.+}})
; CHECK: }
; CHECK: {
; CHECK: [[R5:v[0-9]+]].w = vadd([[R3]].w,[[R4]].w)
; CHECK: }
; CHECK: {
; CHECK: [[R6:v[0-9]+]] = valign([[_:v[0-9]+]],[[R5]],{{.+}})
; CHECK: }
; CHECK: {
; CHECK: [[R7:v[0-9]+]].w = vadd([[R5]].w,[[R6]].w)
; CHECK: }
; CHECK: {
; CHECK: r0 = vextract([[R7]],{{.+}})
; CHECK: }
entry:
%r = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %vec)
ret i32 %r
}
;; Add-reduction of <8 x i32>. The expected output works on scalar register
;; pairs (vaddw on rN:M, with one operand loaded from the stack) rather than
;; on HVX vector registers.
define i32 @add_v8i32(<8 x i32> %vec) #0 {
; CHECK-LABEL: add_v8i32:
; CHECK: {
; CHECK: r[[RS1:[0-9]+:[0-9]+]] = vaddw(r1:0,r5:4)
; CHECK: r[[R6:[0-9]+:[0-9]+]] = memd(r29+#0)
; CHECK: }
; CHECK: {
; CHECK: r[[RS2:[0-9]+:[0-9]+]] = vaddw(r3:2,r[[R6]])
; CHECK: }
; CHECK: {
; CHECK: r[[RS3:[0-9]+:[0-9]+]] = vaddw(r[[RS1]],r[[RS2]])
; CHECK: }
; CHECK: {
;; TODO: combine and double register add can be optimized to single register add.
; CHECK: r[[RS4:[0-9]+:[0-9]+]] = combine(#0,r{{[0-9]+}})
; CHECK: }
; CHECK: {
; CHECK: r1:0 = vaddw(r[[RS3]],r[[RS4]])
entry:
%r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %vec)
ret i32 %r
}
;; Add-reduction of <64 x i32>. Only the instruction order is verified: six
;; vadd with a valign between each consecutive pair. Registers and packet
;; boundaries are not matched.
define i32 @add_v64i32(<64 x i32> %vec) #0 {
; CHECK-LABEL: add_v64i32:
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
entry:
%r = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %vec)
ret i32 %r
}
;; Non-pow2 vectors are scalarized.
;; Add-reduction of <12 x i32>, a length that is not a power of two. The
;; expected output is a chain of scalar add / accumulating add instructions,
;; each feeding the next.
define i32 @add_v12i32(<12 x i32> %vec) #0 {
; CHECK-LABEL: add_v12i32:
; CHECK: [[RS0:r[0-9]+]] = add(r0,r1)
; CHECK: [[RS1:r[0-9]+]] += add([[RS0]],r{{[0-9]+}})
; CHECK: [[RS2:r[0-9]+]] += add([[RS1]],r{{[0-9]+}})
; CHECK: [[RS3:r[0-9]+]] += add([[RS2]],r{{[0-9]+}})
; CHECK: [[RS4:r[0-9]+]] += add([[RS3]],r{{[0-9]+}})
; CHECK: [[RS5:r[0-9]+]] += add([[RS4]],r{{[0-9]+}})
entry:
%r = call i32 @llvm.vector.reduce.add.v12i32(<12 x i32> %vec)
ret i32 %r
}
;; Add-reduction of <3 x i32>: a single scalar accumulating add is expected.
define i32 @add_v3i32(<3 x i32> %vec) #0 {
; CHECK-LABEL: add_v3i32:
; CHECK: r{{[0-9]+}} += add(r{{[0-9]+}},r{{[0-9]+}})
entry:
%r = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> %vec)
ret i32 %r
}
attributes #0 = { nounwind readnone "target-cpu"="hexagonv68" "target-features"="+hvx,+hvx-length128b" }

View File

@ -0,0 +1,145 @@
; RUN: llc -mtriple=hexagon < %s | FileCheck %s
;; Full reduction of a product of two zero-extended <128 x i8> inputs. The
;; expected output zeroes an accumulator with vxor, accumulates with the
;; unsigned-by-unsigned vrmpy, then finishes with five valign/vadd rounds and
;; a vextract.
define i32 @full_reduce_i32_128i8_uu(<128 x i8> %x, <128 x i8> %y) #0 {
; CHECK-LABEL: full_reduce_i32_128i8_uu:
; CHECK: [[A:v[0-9]+]] = vxor([[Z:v[0-9]+]],[[Z]])
; CHECK: [[A]].uw += vrmpy(v0.ub,v1.ub)
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: vextract
%x.wide = zext <128 x i8> %x to <128 x i32>
%y.wide = zext <128 x i8> %y to <128 x i32>
%m = mul nuw nsw <128 x i32> %x.wide, %y.wide
%reduce = tail call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> %m)
ret i32 %reduce
}
;; Full reduction with %x sign-extended and %y zero-extended. The mixed-sign
;; vrmpy is expected with the zero-extended input (v1) as the .ub operand and
;; the sign-extended input (v0) as the .b operand.
define i32 @full_reduce_i32_128i8_su(<128 x i8> %x, <128 x i8> %y) #0 {
; CHECK-LABEL: full_reduce_i32_128i8_su:
; CHECK: [[A:v[0-9]+]] = vxor([[Z:v[0-9]+]],[[Z]])
; CHECK: [[A]].w += vrmpy(v1.ub,v0.b)
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: vextract
%x.wide = sext <128 x i8> %x to <128 x i32>
%y.wide = zext <128 x i8> %y to <128 x i32>
%m = mul nuw nsw <128 x i32> %x.wide, %y.wide
%reduce = tail call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> %m)
ret i32 %reduce
}
;; Full reduction with %x zero-extended and %y sign-extended, the mirror image
;; of the _su case. The same mixed-sign vrmpy is expected, now with v0 as the
;; .ub operand and v1 as the .b operand.
define i32 @full_reduce_i32_128i8_us(<128 x i8> %x, <128 x i8> %y) #0 {
; CHECK-LABEL: full_reduce_i32_128i8_us:
; CHECK: [[A:v[0-9]+]] = vxor([[Z:v[0-9]+]],[[Z]])
; CHECK: [[A]].w += vrmpy(v0.ub,v1.b)
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: vextract
%x.wide = zext <128 x i8> %x to <128 x i32>
%y.wide = sext <128 x i8> %y to <128 x i32>
%m = mul nuw nsw <128 x i32> %x.wide, %y.wide
%reduce = tail call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> %m)
ret i32 %reduce
}
;; Full reduction of a product of two sign-extended <128 x i8> inputs. The
;; signed-by-signed vrmpy is expected, followed by the same valign/vadd ladder
;; and vextract as in the other 128-element cases.
define i32 @full_reduce_i32_128i8_ss(<128 x i8> %x, <128 x i8> %y) #0 {
; CHECK-LABEL: full_reduce_i32_128i8_ss:
; CHECK: [[A:v[0-9]+]] = vxor([[Z:v[0-9]+]],[[Z]])
; CHECK: [[A]].w += vrmpy(v0.b,v1.b)
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: vextract
%x.wide = sext <128 x i8> %x to <128 x i32>
%y.wide = sext <128 x i8> %y to <128 x i32>
%m = mul nuw nsw <128 x i32> %x.wide, %y.wide
%reduce = tail call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> %m)
ret i32 %reduce
}
;; Double-vector input.
;; <256 x i8> inputs. Only the instruction order is verified: two vrmpy, then
;; six vadd with a valign between each consecutive pair.
define i32 @full_reduce_i32_256i8(<256 x i8> %x, <256 x i8> %y) #0 {
; CHECK-LABEL: full_reduce_i32_256i8:
; CHECK: vrmpy
; CHECK: vrmpy
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
%x.wide = zext <256 x i8> %x to <256 x i32>
%y.wide = zext <256 x i8> %y to <256 x i32>
%m = mul nuw nsw <256 x i32> %x.wide, %y.wide
%reduce = tail call i32 @llvm.vector.reduce.add.v256i32(<256 x i32> %m)
ret i32 %reduce
}
;; Maximum handled vector size.
;; <1024 x i8> inputs. Only the instruction order is verified: eight vrmpy,
;; then six vadd with a valign between each consecutive pair.
define i32 @full_reduce_i32_1024i8(<1024 x i8> %x, <1024 x i8> %y) #0 {
; CHECK-LABEL: full_reduce_i32_1024i8:
; CHECK: vrmpy
; CHECK: vrmpy
; CHECK: vrmpy
; CHECK: vrmpy
; CHECK: vrmpy
; CHECK: vrmpy
; CHECK: vrmpy
; CHECK: vrmpy
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
%x.wide = zext <1024 x i8> %x to <1024 x i32>
%y.wide = zext <1024 x i8> %y to <1024 x i32>
%m = mul nuw nsw <1024 x i32> %x.wide, %y.wide
%reduce = tail call i32 @llvm.vector.reduce.add.v1024i32(<1024 x i32> %m)
ret i32 %reduce
}
attributes #0 = { nounwind readnone "target-cpu"="hexagonv68" "target-features"="+hvx,+hvx-length128b" }

View File

@ -0,0 +1,162 @@
;; Check HVX vectorization.
; RUN: llc -mtriple hexagon < %s | FileCheck %s --check-prefixes=CHECK,CHECK-HVX
;; Check that compiling to scalar code does not fail; the output is only spot-checked for a scalar multiply-accumulate.
; RUN: llc -mtriple hexagon -mattr=-hvx,-hvxv73,-hvx-length128b < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NO-HVX
;; Partial add-reduction of the product of two zero-extended <64 x i8> inputs
;; into a <16 x i32> accumulator (attribute group #0). In the HVX run the whole
;; operation is expected to be one accumulating unsigned vrmpy; in the scalar
;; run only a scalar multiply-accumulate is matched.
define <16 x i32> @partial_reduce_uu_64(<16 x i32> %acc, <64 x i8> %x, <64 x i8> %y) #0 {
; CHECK-LABEL: partial_reduce_uu_64:
; CHECK-HVX: v0.uw += vrmpy(v1.ub,v2.ub)
; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
  %x.ext = zext <64 x i8> %x to <64 x i32>
  %y.ext = zext <64 x i8> %y to <64 x i32>
  %m = mul nuw nsw <64 x i32> %x.ext, %y.ext
  ;; The mangling suffix names the actual accumulator and input vector types.
  %partial.reduce = tail call <16 x i32> @llvm.vector.partial.reduce.add.v16i32.v64i32(<16 x i32> %acc, <64 x i32> %m)
  ret <16 x i32> %partial.reduce
}
;; Partial add-reduction of a sign-extended %x times a zero-extended %y
;; (<64 x i8> each) into a <16 x i32> accumulator. In the HVX run a single
;; accumulating mixed-sign vrmpy is expected, with the unsigned operand (v2)
;; listed first; in the scalar run only a scalar multiply-accumulate is matched.
;; NOTE(review): `nuw` on a product with a sign-extended operand can give
;; poison when that operand is negative; confirm the flag is intended here.
define <16 x i32> @partial_reduce_su_64(<16 x i32> %acc, <64 x i8> %x, <64 x i8> %y) #0 {
; CHECK-LABEL: partial_reduce_su_64:
; CHECK-HVX: v0.w += vrmpy(v2.ub,v1.b)
; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
  %x.ext = sext <64 x i8> %x to <64 x i32>
  %y.ext = zext <64 x i8> %y to <64 x i32>
  %m = mul nuw nsw <64 x i32> %x.ext, %y.ext
  ;; The mangling suffix names the actual accumulator and input vector types.
  %partial.reduce = tail call <16 x i32> @llvm.vector.partial.reduce.add.v16i32.v64i32(<16 x i32> %acc, <64 x i32> %m)
  ret <16 x i32> %partial.reduce
}
;; Partial add-reduction of a zero-extended %x times a sign-extended %y
;; (<64 x i8> each) into a <16 x i32> accumulator. In the HVX run a single
;; accumulating mixed-sign vrmpy is expected, with the unsigned operand (v1)
;; listed first; in the scalar run only a scalar multiply-accumulate is matched.
;; NOTE(review): `nuw` on a product with a sign-extended operand can give
;; poison when that operand is negative; confirm the flag is intended here.
define <16 x i32> @partial_reduce_us_64(<16 x i32> %acc, <64 x i8> %x, <64 x i8> %y) #0 {
; CHECK-LABEL: partial_reduce_us_64:
; CHECK-HVX: v0.w += vrmpy(v1.ub,v2.b)
; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
  %x.ext = zext <64 x i8> %x to <64 x i32>
  %y.ext = sext <64 x i8> %y to <64 x i32>
  %m = mul nuw nsw <64 x i32> %x.ext, %y.ext
  ;; The mangling suffix names the actual accumulator and input vector types.
  %partial.reduce = tail call <16 x i32> @llvm.vector.partial.reduce.add.v16i32.v64i32(<16 x i32> %acc, <64 x i32> %m)
  ret <16 x i32> %partial.reduce
}
;; Partial add-reduction of the product of two sign-extended <64 x i8> inputs
;; into a <16 x i32> accumulator. In the HVX run a single accumulating signed
;; vrmpy is expected; in the scalar run only a scalar multiply-accumulate is
;; matched.
;; NOTE(review): `nuw` on a product of sign-extended operands can give poison
;; when an operand is negative; confirm the flag is intended here.
define <16 x i32> @partial_reduce_ss_64(<16 x i32> %acc, <64 x i8> %x, <64 x i8> %y) #0 {
; CHECK-LABEL: partial_reduce_ss_64:
; CHECK-HVX: v0.w += vrmpy(v1.b,v2.b)
; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
  %x.ext = sext <64 x i8> %x to <64 x i32>
  %y.ext = sext <64 x i8> %y to <64 x i32>
  %m = mul nuw nsw <64 x i32> %x.ext, %y.ext
  ;; The mangling suffix names the actual accumulator and input vector types.
  %partial.reduce = tail call <16 x i32> @llvm.vector.partial.reduce.add.v16i32.v64i32(<16 x i32> %acc, <64 x i32> %m)
  ret <16 x i32> %partial.reduce
}
;; Partial add-reduction of the product of two zero-extended <128 x i8> inputs
;; into a <32 x i32> accumulator (attribute group #1). In the HVX run a single
;; accumulating unsigned vrmpy is expected; in the scalar run only a scalar
;; multiply-accumulate is matched.
define <32 x i32> @partial_reduce_uu_128(<32 x i32> %acc, <128 x i8> %x, <128 x i8> %y) #1 {
; CHECK-LABEL: partial_reduce_uu_128:
; CHECK-HVX: v0.uw += vrmpy(v1.ub,v2.ub)
; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
  %lhs.wide = zext <128 x i8> %x to <128 x i32>
  %rhs.wide = zext <128 x i8> %y to <128 x i32>
  %prod = mul nuw nsw <128 x i32> %lhs.wide, %rhs.wide
  %res = tail call <32 x i32> @llvm.vector.partial.reduce.add.v32i32.v128i32(<32 x i32> %acc, <128 x i32> %prod)
  ret <32 x i32> %res
}
;; Partial add-reduction of a sign-extended %x times a zero-extended %y
;; (<128 x i8> each) into a <32 x i32> accumulator. In the HVX run a single
;; accumulating mixed-sign vrmpy is expected, with the unsigned operand (v2)
;; listed first; in the scalar run only a scalar multiply-accumulate is matched.
;; NOTE(review): `nuw` on a product with a sign-extended operand can give
;; poison when that operand is negative; confirm the flag is intended here.
define <32 x i32> @partial_reduce_su_128(<32 x i32> %acc, <128 x i8> %x, <128 x i8> %y) #1 {
; CHECK-LABEL: partial_reduce_su_128:
; CHECK-HVX: v0.w += vrmpy(v2.ub,v1.b)
; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
  %lhs.wide = sext <128 x i8> %x to <128 x i32>
  %rhs.wide = zext <128 x i8> %y to <128 x i32>
  %prod = mul nuw nsw <128 x i32> %lhs.wide, %rhs.wide
  %res = tail call <32 x i32> @llvm.vector.partial.reduce.add.v32i32.v128i32(<32 x i32> %acc, <128 x i32> %prod)
  ret <32 x i32> %res
}
;; Partial add-reduction of a zero-extended %x times a sign-extended %y
;; (<128 x i8> each) into a <32 x i32> accumulator. In the HVX run a single
;; accumulating mixed-sign vrmpy is expected, with the unsigned operand (v1)
;; listed first; in the scalar run only a scalar multiply-accumulate is matched.
;; NOTE(review): `nuw` on a product with a sign-extended operand can give
;; poison when that operand is negative; confirm the flag is intended here.
define <32 x i32> @partial_reduce_us_128(<32 x i32> %acc, <128 x i8> %x, <128 x i8> %y) #1 {
; CHECK-LABEL: partial_reduce_us_128:
; CHECK-HVX: v0.w += vrmpy(v1.ub,v2.b)
; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
  %lhs.wide = zext <128 x i8> %x to <128 x i32>
  %rhs.wide = sext <128 x i8> %y to <128 x i32>
  %prod = mul nuw nsw <128 x i32> %lhs.wide, %rhs.wide
  %res = tail call <32 x i32> @llvm.vector.partial.reduce.add.v32i32.v128i32(<32 x i32> %acc, <128 x i32> %prod)
  ret <32 x i32> %res
}
;; Partial add-reduction of the product of two sign-extended <128 x i8> inputs
;; into a <32 x i32> accumulator. In the HVX run a single accumulating signed
;; vrmpy is expected; in the scalar run only a scalar multiply-accumulate is
;; matched.
;; NOTE(review): `nuw` on a product of sign-extended operands can give poison
;; when an operand is negative; confirm the flag is intended here.
define <32 x i32> @partial_reduce_ss_128(<32 x i32> %acc, <128 x i8> %x, <128 x i8> %y) #1 {
; CHECK-LABEL: partial_reduce_ss_128:
; CHECK-HVX: v0.w += vrmpy(v1.b,v2.b)
; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
  %lhs.wide = sext <128 x i8> %x to <128 x i32>
  %rhs.wide = sext <128 x i8> %y to <128 x i32>
  %prod = mul nuw nsw <128 x i32> %lhs.wide, %rhs.wide
  %res = tail call <32 x i32> @llvm.vector.partial.reduce.add.v32i32.v128i32(<32 x i32> %acc, <128 x i32> %prod)
  ret <32 x i32> %res
}
;; Multiple-size inputs, same output size.
;; Partial add-reduction of the product of two zero-extended <256 x i8> inputs
;; into a single <32 x i32> accumulator. In the HVX run two accumulating
;; unsigned vrmpy results are expected, which are then added to the incoming
;; accumulator (v0) by two vadd instructions; in the scalar run only a scalar
;; multiply-accumulate is matched.
define <32 x i32> @partial_reduce_uu_32xi32_256xi8(<32 x i32> %acc, <256 x i8> %x, <256 x i8> %y) #1 {
; CHECK-LABEL: partial_reduce_uu_32xi32_256xi8:
; CHECK-HVX: [[R1:v[0-9]+]].uw += vrmpy({{v[0-9]+}}.ub,{{v[0-9]+}}.ub)
; CHECK-HVX: [[R2:v[0-9]+]].uw += vrmpy({{v[0-9]+}}.ub,{{v[0-9]+}}.ub)
; CHECK-HVX: [[R3:v[0-9]+]].w = vadd(v0.w,[[R1]].w)
; CHECK-HVX: v0.w = vadd([[R2]].w,[[R3]].w)
; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
  %lhs.wide = zext <256 x i8> %x to <256 x i32>
  %rhs.wide = zext <256 x i8> %y to <256 x i32>
  %prod = mul nuw nsw <256 x i32> %lhs.wide, %rhs.wide
  %res = tail call <32 x i32> @llvm.vector.partial.reduce.add.v32i32.v256i32(<32 x i32> %acc, <256 x i32> %prod)
  ret <32 x i32> %res
}
;; Partial add-reduction of the product of two zero-extended <1024 x i8>
;; inputs into a single <32 x i32> accumulator. The HVX run expects eight
;; vrmpy and eight vadd matches in any relative order; the scalar run only
;; matches a scalar multiply-accumulate.
define <32 x i32> @partial_reduce_uu_32xi32_1024xi8(<32 x i32> %acc, <1024 x i8> %x, <1024 x i8> %y) #1 {
; CHECK-LABEL: partial_reduce_uu_32xi32_1024xi8:
; CHECK-HVX-DAG: vrmpy
; CHECK-HVX-DAG: vadd
; CHECK-HVX-DAG: vrmpy
; CHECK-HVX-DAG: vadd
; CHECK-HVX-DAG: vrmpy
; CHECK-HVX-DAG: vadd
; CHECK-HVX-DAG: vrmpy
; CHECK-HVX-DAG: vadd
; CHECK-HVX-DAG: vrmpy
; CHECK-HVX-DAG: vadd
; CHECK-HVX-DAG: vrmpy
; CHECK-HVX-DAG: vadd
; CHECK-HVX-DAG: vrmpy
; CHECK-HVX-DAG: vadd
; CHECK-HVX-DAG: vrmpy
; CHECK-HVX-DAG: vadd
; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
  %lhs.wide = zext <1024 x i8> %x to <1024 x i32>
  %rhs.wide = zext <1024 x i8> %y to <1024 x i32>
  %prod = mul nuw nsw <1024 x i32> %lhs.wide, %rhs.wide
  %res = tail call <32 x i32> @llvm.vector.partial.reduce.add.v32i32.v1024i32(<32 x i32> %acc, <1024 x i32> %prod)
  ret <32 x i32> %res
}
;; Partial add-reduction of the product of two zero-extended <1024 x i8>
;; inputs into a <256 x i32> accumulator (a 4x reduction). The HVX run expects
;; eight vrmpy instructions and no vadd before the function returns; the
;; scalar run only matches a scalar multiply-accumulate.
;; NOTE(review): the function name says 64xi32 although the accumulator is
;; <256 x i32>; the name is kept so the label stays stable.
define <256 x i32> @partial_reduce_uu_64xi32_1024xi8(<256 x i32> %acc, <1024 x i8> %x, <1024 x i8> %y) #1 {
; CHECK-LABEL: partial_reduce_uu_64xi32_1024xi8:
; CHECK-HVX-COUNT-8: vrmpy
; CHECK-HVX-NOT: vadd
; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
; CHECK-HVX: dealloc_return
  %x.ext = zext <1024 x i8> %x to <1024 x i32>
  %y.ext = zext <1024 x i8> %y to <1024 x i32>
  %m = mul nuw nsw <1024 x i32> %x.ext, %y.ext
  ;; The mangling suffix names the actual accumulator and input vector types.
  %partial.reduce = tail call <256 x i32> @llvm.vector.partial.reduce.add.v256i32.v1024i32(<256 x i32> %acc, <1024 x i32> %m)
  ret <256 x i32> %partial.reduce
}
;; Check a vector size that does not match an available vrmpy (2x reduction).
;; Partial add-reduction of the product of two zero-extended <128 x i8> inputs
;; into a <64 x i32> accumulator, i.e. only a 2x reduction. The HVX run expects
;; a vmpy followed by a vadd; the scalar run has no pattern beyond the label.
;; NOTE(review): the absence of vrmpy is not asserted here; consider adding a
;; negative match if that is the intent.
define <64 x i32> @partial_reduce_unsupported(<64 x i32> %acc, <128 x i8> %x, <128 x i8> %y) #1 {
; CHECK-LABEL: partial_reduce_unsupported:
; CHECK-HVX: vmpy
; CHECK-HVX: vadd
  %lhs.wide = zext <128 x i8> %x to <128 x i32>
  %rhs.wide = zext <128 x i8> %y to <128 x i32>
  %prod = mul nuw nsw <128 x i32> %lhs.wide, %rhs.wide
  %res = tail call <64 x i32> @llvm.vector.partial.reduce.add.v64i32.v128i32(<64 x i32> %acc, <128 x i32> %prod)
  ret <64 x i32> %res
}
;; Attribute group #0: hexagonv73 with HVX v73 in 64-byte vector mode.
attributes #0 = { nounwind "target-cpu"="hexagonv73" "target-features"="+hvxv73,+hvx-length64b" }
;; Attribute group #1: hexagonv73 with HVX v73 in 128-byte vector mode.
attributes #1 = { nounwind "target-cpu"="hexagonv73" "target-features"="+hvxv73,+hvx-length128b" }