[Hexagon] Support partial reduction intrinsics (#179797)
This commit has changes necessary for using vrmpy instructions in full and partial multiply/add reductions on extended arguments. There are three main parts: - partial reduction operations PARTIAL_REDUCE_(U|S|SU)MLA are lowered to accumulating vrmpy, including native and multiples of native vector sizes; - full and partial reductions can be "split" into an inner partial reduction and a residual full or partial reduction. The inner reduction will be lowered to vrmpy due to the first change; - vecreduce_add expansion is moved to Hexagon backend from a generic pass, accompanied by a set of tests. In addition, there is a minor cleanup in HexagonTargetLowering::PerformDAGCombine().
This commit is contained in:
parent
42618de278
commit
98d8b69dfc
@ -1506,6 +1506,8 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
|
||||
MaxStoresPerMemset = 8;
|
||||
MaxStoresPerMemsetOptSize = 4;
|
||||
|
||||
setTargetDAGCombine(ISD::VECREDUCE_ADD);
|
||||
|
||||
//
|
||||
// Set up register classes.
|
||||
//
|
||||
@ -3413,16 +3415,50 @@ HexagonTargetLowering::ReplaceNodeResults(SDNode *N,
|
||||
SDValue
|
||||
HexagonTargetLowering::PerformDAGCombine(SDNode *N,
|
||||
DAGCombinerInfo &DCI) const {
|
||||
SDValue Op(N, 0);
|
||||
const SDLoc &dl(Op);
|
||||
unsigned Opc = Op.getOpcode();
|
||||
|
||||
// Combining transformations applicable for arbitrary vector sizes.
|
||||
if (DCI.isBeforeLegalizeOps()) {
|
||||
switch (Opc) {
|
||||
case ISD::VECREDUCE_ADD:
|
||||
if (SDValue V = splitVecReduceAdd(N, DCI.DAG))
|
||||
return V;
|
||||
if (SDValue V = expandVecReduceAdd(N, DCI.DAG))
|
||||
return V;
|
||||
return SDValue();
|
||||
case ISD::PARTIAL_REDUCE_SMLA:
|
||||
case ISD::PARTIAL_REDUCE_UMLA:
|
||||
case ISD::PARTIAL_REDUCE_SUMLA:
|
||||
if (SDValue V = splitExtendingPartialReduceMLA(N, DCI.DAG))
|
||||
return V;
|
||||
return SDValue();
|
||||
}
|
||||
} else {
|
||||
switch (Opc) {
|
||||
case ISD::VSELECT: {
|
||||
// (vselect (xor x, ptrue), v0, v1) -> (vselect x, v1, v0)
|
||||
SDValue Cond = Op.getOperand(0);
|
||||
if (Cond->getOpcode() == ISD::XOR) {
|
||||
SDValue C0 = Cond.getOperand(0), C1 = Cond.getOperand(1);
|
||||
if (C1->getOpcode() == HexagonISD::PTRUE) {
|
||||
SDValue VSel = DCI.DAG.getNode(ISD::VSELECT, dl, ty(Op), C0,
|
||||
Op.getOperand(2), Op.getOperand(1));
|
||||
return VSel;
|
||||
}
|
||||
}
|
||||
return SDValue();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (isHvxOperation(N, DCI.DAG)) {
|
||||
if (SDValue V = PerformHvxDAGCombine(N, DCI))
|
||||
return V;
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
SDValue Op(N, 0);
|
||||
const SDLoc &dl(Op);
|
||||
unsigned Opc = Op.getOpcode();
|
||||
|
||||
if (Opc == ISD::TRUNCATE) {
|
||||
SDValue Op0 = Op.getOperand(0);
|
||||
// fold (truncate (build pair x, y)) -> (truncate x) or x
|
||||
@ -3441,7 +3477,8 @@ HexagonTargetLowering::PerformDAGCombine(SDNode *N,
|
||||
if (DCI.isBeforeLegalizeOps())
|
||||
return SDValue();
|
||||
|
||||
if (Opc == HexagonISD::P2D) {
|
||||
switch (Opc) {
|
||||
case HexagonISD::P2D: {
|
||||
SDValue P = Op.getOperand(0);
|
||||
switch (P.getOpcode()) {
|
||||
case HexagonISD::PTRUE:
|
||||
@ -3451,20 +3488,9 @@ HexagonTargetLowering::PerformDAGCombine(SDNode *N,
|
||||
default:
|
||||
break;
|
||||
}
|
||||
} else if (Opc == ISD::VSELECT) {
|
||||
// This is pretty much duplicated in HexagonISelLoweringHVX...
|
||||
//
|
||||
// (vselect (xor x, ptrue), v0, v1) -> (vselect x, v1, v0)
|
||||
SDValue Cond = Op.getOperand(0);
|
||||
if (Cond->getOpcode() == ISD::XOR) {
|
||||
SDValue C0 = Cond.getOperand(0), C1 = Cond.getOperand(1);
|
||||
if (C1->getOpcode() == HexagonISD::PTRUE) {
|
||||
SDValue VSel = DCI.DAG.getNode(ISD::VSELECT, dl, ty(Op), C0,
|
||||
Op.getOperand(2), Op.getOperand(1));
|
||||
return VSel;
|
||||
}
|
||||
}
|
||||
} else if (Opc == ISD::TRUNCATE) {
|
||||
break;
|
||||
}
|
||||
case ISD::TRUNCATE: {
|
||||
SDValue Op0 = Op.getOperand(0);
|
||||
// fold (truncate (build pair x, y)) -> (truncate x) or x
|
||||
if (Op0.getOpcode() == ISD::BUILD_PAIR) {
|
||||
@ -3477,7 +3503,9 @@ HexagonTargetLowering::PerformDAGCombine(SDNode *N,
|
||||
if (ty(Elem0).bitsGT(TruncTy))
|
||||
return DCI.DAG.getNode(ISD::TRUNCATE, dl, TruncTy, Elem0);
|
||||
}
|
||||
} else if (Opc == ISD::OR) {
|
||||
break;
|
||||
}
|
||||
case ISD::OR: {
|
||||
// fold (or (shl xx, s), (zext y)) -> (COMBINE (shl xx, s-32), y)
|
||||
// if s >= 32
|
||||
auto fold0 = [&, this](SDValue Op) {
|
||||
@ -3507,6 +3535,8 @@ HexagonTargetLowering::PerformDAGCombine(SDNode *N,
|
||||
|
||||
if (SDValue R = fold0(Op))
|
||||
return R;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return SDValue();
|
||||
@ -3750,6 +3780,78 @@ EVT HexagonTargetLowering::getOptimalMemOpType(
|
||||
return MVT::Other;
|
||||
}
|
||||
|
||||
// The helpers below are versions of llvm::getShuffleReduction and
|
||||
// llvm::getOrderedReduction, adapted for use during DAG passes and simplified as
|
||||
// follows:
|
||||
// - ICmp and FCmp are not handled;
|
||||
// - in every step in getShuffleReduction, the input is split into halves (not
|
||||
// pairwise).
|
||||
|
||||
/// Emit a fully scalarized reduction of \p Vec: element 0 is extracted first,
/// then every remaining element is extracted in index order and folded into
/// the running result with the binary node \p Op.
///
/// \param Vec vector value to reduce.
/// \param Op  ISD opcode of the binary node used to combine two elements; it
///            is handed directly to SelectionDAG::getNode.
/// \param DAG DAG in which the new nodes are created.
/// \returns the scalar result, typed as the element type of \p Vec.
static SDValue getOrderedReduction(SDValue Vec, unsigned Op,
                                   SelectionDAG &DAG) {
  // Op lives in the ISD opcode space (see the getNode call below), so the
  // "comparisons are not handled" restriction has to be checked against the
  // DAG comparison node, not against IR Instruction opcodes.
  assert(Op != ISD::SETCC && "Comparison reductions are not supported!");

  EVT VT = Vec.getValueType();
  EVT EltT = VT.getVectorElementType();
  unsigned VF = VT.getVectorNumElements();
  assert(VF > 0 &&
         "Reduction emission only supported for non-zero length vectors!");

  SDLoc DL(Vec);
  // Seed the chain with element 0, then accumulate elements 1..VF-1.
  SDValue Result = DAG.getExtractVectorElt(DL, EltT, Vec, 0);
  for (unsigned ExtractIdx = 1; ExtractIdx < VF; ++ExtractIdx) {
    SDValue Ext = DAG.getExtractVectorElt(DL, EltT, Vec, ExtractIdx);
    Result = DAG.getNode(Op, DL, EltT, {Result, Ext});
  }

  return Result;
}
|
||||
|
||||
/// Emit a halving "ladder" reduction of \p Vec. In every round the upper half
/// of the still-live lanes is shuffled down onto the lower half and combined
/// with it using the vector node \p Op, so the number of live lanes halves
/// each round. The final value ends up in lane 0 and is extracted.
///
/// \param Vec vector value to reduce; its element count must be a power of 2.
/// \param Op  ISD opcode of the binary node used to combine two vectors; it is
///            handed directly to SelectionDAG::getNode.
/// \param DAG DAG in which the new nodes are created.
/// \returns the scalar result, typed as the element type of \p Vec.
static SDValue getShuffleReduction(SDValue Vec, unsigned Op,
                                   SelectionDAG &DAG) {
  // Op lives in the ISD opcode space (see the getNode call below), so the
  // "comparisons are not handled" restriction has to be checked against the
  // DAG comparison node, not against IR Instruction opcodes.
  assert(Op != ISD::SETCC && "Comparison reductions are not supported!");

  EVT VT = Vec.getValueType();
  unsigned VF = VT.getVectorNumElements();
  if (VF == 0)
    llvm_unreachable("Vector must be non-zero length");
  // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
  // and vector ops, reducing the set of values being computed by half each
  // round.
  assert(isPowerOf2_32(VF) &&
         "Reduction emission only supported for pow2 vectors!");

  SDLoc DL(Vec);
  // TODO: Is it correct to create double-vector shuffle and fill 3/4 of it with
  // undefs?
  SmallVector<int, 32> ShuffleMask(VF);
  for (unsigned i = VF; i > 1; i >>= 1) {
    // Move the upper half of the vector to the lower half.
    for (unsigned j = 0; j != i / 2; ++j)
      ShuffleMask[j] = i / 2 + j;
    // Fill the rest of the mask with undef.
    std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), -1);

    SDValue Shuf =
        DAG.getVectorShuffle(VT, DL, Vec, DAG.getUNDEF(VT), ShuffleMask);

    // Lanes [0, i/2) now hold the pairwise combination of both halves.
    Vec = DAG.getNode(Op, DL, VT, {Vec, Shuf});
  }
  // The result is in the first element of the vector.
  return DAG.getExtractVectorElt(DL, VT.getVectorElementType(), Vec, 0);
}
|
||||
|
||||
/// Expand the add-reduction node \p N into explicit DAG nodes.
///
/// The operand vector is reduced with a halving shuffle ladder when its
/// element count is a power of two, and with a scalarized chain of adds
/// otherwise.
SDValue HexagonTargetLowering::expandVecReduceAdd(SDNode *N,
                                                  SelectionDAG &DAG) const {
  SDValue Src = N->getOperand(0);
  const unsigned NumElts = Src.getValueType().getVectorNumElements();

  // Automatic reduction expansion is disabled for this intrinsic, so the
  // expansion is produced here: log2 ladder for power-of-two lengths,
  // element-by-element otherwise.
  return isPowerOf2_32(NumElts) ? getShuffleReduction(Src, ISD::ADD, DAG)
                                : getOrderedReduction(Src, ISD::ADD, DAG);
}
|
||||
|
||||
bool HexagonTargetLowering::allowsMemoryAccess(
|
||||
LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace,
|
||||
Align Alignment, MachineMemOperand::Flags Flags, unsigned *Fast) const {
|
||||
|
||||
@ -485,6 +485,7 @@ private:
|
||||
SDValue LowerHvxIntToFp(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerHvxPred32ToFp(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerHvxPred64ToFp(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerHvxPartialReduceMLA(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue ExpandHvxFpToInt(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue ExpandHvxIntToFp(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerHvxStore(SDValue Op, SelectionDAG &DAG) const;
|
||||
@ -519,10 +520,14 @@ private:
|
||||
SDValue combineTruncateBeforeLegal(SDValue Op, DAGCombinerInfo &DCI) const;
|
||||
SDValue combineConcatVectorsBeforeLegal(SDValue Op, DAGCombinerInfo & DCI)
|
||||
const;
|
||||
SDValue combineVectorShuffleBeforeLegal(SDValue Op, DAGCombinerInfo & DCI)
|
||||
const;
|
||||
|
||||
SDValue PerformHvxDAGCombine(SDNode * N, DAGCombinerInfo & DCI) const;
|
||||
SDValue expandVecReduceAdd(SDNode *N, SelectionDAG &DAG) const;
|
||||
SDValue createExtendingPartialReduceMLA(
|
||||
unsigned Opcode, EVT AccEltType, unsigned AccNumElements, EVT InputType,
|
||||
const SDValue &A, const SDValue &B, unsigned &RemainingReductionRatio,
|
||||
const SDLoc &DL, SelectionDAG &DAG) const;
|
||||
SDValue splitVecReduceAdd(SDNode *N, SelectionDAG &DAG) const;
|
||||
SDValue splitExtendingPartialReduceMLA(SDNode *N, SelectionDAG &DAG) const;
|
||||
SDValue PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
};
|
||||
|
||||
} // end namespace llvm
|
||||
|
||||
@ -40,6 +40,8 @@ static const MVT LegalW64[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 };
|
||||
static const MVT LegalV128[] = { MVT::v128i8, MVT::v64i16, MVT::v32i32 };
|
||||
static const MVT LegalW128[] = { MVT::v256i8, MVT::v128i16, MVT::v64i32 };
|
||||
|
||||
static const unsigned MaxExpandMLA = 8;
|
||||
|
||||
static std::tuple<unsigned, unsigned, unsigned> getIEEEProperties(MVT Ty) {
|
||||
// For a float scalar type, return (exp-bits, exp-bias, fraction-bits)
|
||||
MVT ElemTy = Ty.getScalarType();
|
||||
@ -504,6 +506,69 @@ HexagonTargetLowering::initializeHVXLowering() {
|
||||
setOperationAction(ISD::SINT_TO_FP, MVT::v32i1, Custom);
|
||||
|
||||
setTargetDAGCombine({ISD::CONCAT_VECTORS, ISD::TRUNCATE, ISD::VSELECT});
|
||||
|
||||
setTargetDAGCombine({ISD::PARTIAL_REDUCE_SMLA, ISD::PARTIAL_REDUCE_UMLA,
|
||||
ISD::PARTIAL_REDUCE_SUMLA});
|
||||
|
||||
// Partial MLA reductions.
|
||||
{
|
||||
static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
|
||||
ISD::PARTIAL_REDUCE_UMLA,
|
||||
ISD::PARTIAL_REDUCE_SUMLA};
|
||||
|
||||
auto HvxType = [=](MVT ScalarT, unsigned Factor = 1) {
|
||||
return MVT::getVectorVT(ScalarT, Subtarget.getVectorLength() * Factor *
|
||||
8 / ScalarT.getSizeInBits());
|
||||
};
|
||||
|
||||
// Tuple of (Acc element type, input element type, vector pair).
|
||||
// The assumption is both the input and reduction result are of the same
|
||||
// size so the reduction ratio is the same as the ratio of element type
|
||||
// sizes. This may not hold for all available instructions.
|
||||
typedef std::tuple<MVT, MVT, bool> ReductionSignature;
|
||||
|
||||
static const std::vector<ReductionSignature> NativeReductions = {
|
||||
{MVT::i32, MVT::i8, false},
|
||||
};
|
||||
|
||||
for (const auto &R : NativeReductions) {
|
||||
|
||||
MVT AccType = std::get<0>(R);
|
||||
MVT InputType = std::get<1>(R);
|
||||
unsigned Factor = std::get<2>(R) ? 2 : 1;
|
||||
|
||||
// The native size is legal.
|
||||
setPartialReduceMLAAction(MLAOps, HvxType(AccType), HvxType(InputType),
|
||||
Legal);
|
||||
|
||||
// Allow custom partial MLA reductions on larger vectors than legally
|
||||
// supported. These reductions must be declared as Custom (or Legal)
|
||||
// for foldPartialReduceMLAMulOp() to fold the multiply by one pattern
|
||||
// inserted when the partial reduction intrinsic is converted to
|
||||
// PARTIAL_REDUCE_U/S/SUMLA. Otherwise, the Split action will apply
|
||||
// on the original pattern, including the extensions and multiplies,
|
||||
// which will make it impossible to match.
|
||||
// There are two independent ways to extend the
|
||||
// input size: 1) to concatenate the result - output vector is
|
||||
// proportionally extended, 2) to reduce the result - the output vector
|
||||
// size stays the same. We limit allowed combinations so that the total
|
||||
// number of generated reduction instructions is limited by a constant
|
||||
// number. This limit is arbitrary and can be revised. On one hand, it is
|
||||
// convenient to have more choices; on the other hand, there is a
|
||||
// diminishing benefit of very long sequences, which should probably be
|
||||
// written as loops instead.
|
||||
for (unsigned ConcatFactor = 1; ConcatFactor <= MaxExpandMLA;
|
||||
ConcatFactor <<= 1)
|
||||
for (unsigned ReductionFactor = 1; ReductionFactor <= MaxExpandMLA;
|
||||
ReductionFactor <<= 1)
|
||||
if (ConcatFactor * ReductionFactor != 1 &&
|
||||
ConcatFactor * ReductionFactor <= MaxExpandMLA)
|
||||
setPartialReduceMLAAction(
|
||||
MLAOps, HvxType(AccType, Factor * ConcatFactor),
|
||||
HvxType(InputType, Factor * ConcatFactor * ReductionFactor),
|
||||
Custom);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
unsigned
|
||||
@ -3678,6 +3743,11 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
|
||||
case HexagonISD::SMUL_LOHI:
|
||||
case HexagonISD::UMUL_LOHI:
|
||||
case HexagonISD::USMUL_LOHI: return LowerHvxMulLoHi(Op, DAG);
|
||||
|
||||
case ISD::PARTIAL_REDUCE_SMLA:
|
||||
case ISD::PARTIAL_REDUCE_UMLA:
|
||||
case ISD::PARTIAL_REDUCE_SUMLA:
|
||||
return LowerHvxPartialReduceMLA(Op, DAG);
|
||||
// clang-format on
|
||||
}
|
||||
#ifndef NDEBUG
|
||||
@ -4020,6 +4090,198 @@ HexagonTargetLowering::combineConcatVectorsBeforeLegal(
|
||||
return DAG.getVectorShuffle(LongTy, dl, Cat, DAG.getUNDEF(LongTy), LongMask);
|
||||
}
|
||||
|
||||
// Create the inner partial reduction MLA that can be efficiently lowered. This
|
||||
// function is used by partial and full reductions.
|
||||
SDValue HexagonTargetLowering::createExtendingPartialReduceMLA(
|
||||
unsigned Opcode, EVT AccEltType, unsigned AccNumElements, EVT InputType,
|
||||
const SDValue &A, const SDValue &B, unsigned &RemainingReductionRatio,
|
||||
const SDLoc &DL, SelectionDAG &DAG) const {
|
||||
const auto &Subtarget = DAG.getSubtarget<HexagonSubtarget>();
|
||||
if (!Subtarget.useHVXOps())
|
||||
return SDValue();
|
||||
|
||||
EVT InputEltType = InputType.getVectorElementType();
|
||||
|
||||
// Find if an optimized instruction for the sub-reduction is available.
|
||||
unsigned NativeRatio;
|
||||
if (AccEltType == MVT::i32 && InputEltType == MVT::i8)
|
||||
NativeRatio = 4;
|
||||
else
|
||||
return SDValue();
|
||||
|
||||
// We only handle the case when additional reduction will be needed, i.e.
|
||||
// input is longer by a larger factor than the result.
|
||||
ElementCount InputEC = InputType.getVectorElementCount();
|
||||
if (!InputEC.isKnownMultipleOf(AccNumElements * NativeRatio))
|
||||
return SDValue();
|
||||
|
||||
unsigned InputNumElements = InputEC.getFixedValue();
|
||||
RemainingReductionRatio = InputNumElements / (AccNumElements * NativeRatio);
|
||||
if (RemainingReductionRatio == 1)
|
||||
return SDValue();
|
||||
|
||||
// Create a reduction by the natively supported factor.
|
||||
EVT IntermediateType = EVT::getVectorVT(*DAG.getContext(), AccEltType,
|
||||
InputNumElements / NativeRatio);
|
||||
|
||||
SDValue Zero = DAG.getConstant(0, DL, IntermediateType);
|
||||
return DAG.getNode(Opcode, DL, IntermediateType, Zero, A, B);
|
||||
}
|
||||
|
||||
static bool DetectExtendingMultiply(const SDValue &N, EVT ScalarType,
|
||||
unsigned &Opcode, SDValue &A, SDValue &B) {
|
||||
SDValue Mul = N;
|
||||
EVT AccType = Mul.getValueType(); // Vector input type after extension.
|
||||
if (ScalarType != AccType.getVectorElementType())
|
||||
return false;
|
||||
bool swap = false;
|
||||
if (Mul->getOpcode() != ISD::MUL)
|
||||
return false;
|
||||
A = Mul->getOperand(0);
|
||||
B = Mul->getOperand(1);
|
||||
if (A.getOpcode() == ISD::ZERO_EXTEND) {
|
||||
if (B.getOpcode() == ISD::ZERO_EXTEND)
|
||||
Opcode = ISD::PARTIAL_REDUCE_UMLA;
|
||||
else if (B.getOpcode() == ISD::SIGN_EXTEND) {
|
||||
swap = true;
|
||||
Opcode = ISD::PARTIAL_REDUCE_SUMLA;
|
||||
} else
|
||||
return false;
|
||||
} else if (A.getOpcode() == ISD::SIGN_EXTEND) {
|
||||
if (B.getOpcode() == ISD::ZERO_EXTEND)
|
||||
Opcode = ISD::PARTIAL_REDUCE_SUMLA;
|
||||
else if (B.getOpcode() == ISD::SIGN_EXTEND)
|
||||
Opcode = ISD::PARTIAL_REDUCE_SMLA;
|
||||
else
|
||||
return false;
|
||||
} else
|
||||
return false;
|
||||
|
||||
// Get multiplication arguments before extension.
|
||||
A = A->getOperand(0);
|
||||
B = B->getOperand(0);
|
||||
if (A.getValueType() != B.getValueType())
|
||||
return false;
|
||||
|
||||
if (swap)
|
||||
std::swap(A, B);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Try to rewrite a full add-reduction of an extending multiply as an inner
/// extending partial MLA reduction followed by a full add-reduction of the
/// (shorter) partial result.
///
/// \returns the replacement VECREDUCE_ADD node, or an empty SDValue when HVX
/// is not in use, the operand is not an extending multiply, or no inner
/// partial reduction could be created.
SDValue HexagonTargetLowering::splitVecReduceAdd(SDNode *N,
                                                 SelectionDAG &DAG) const {
  if (!Subtarget.useHVXOps())
    return SDValue();

  const EVT ResultTy = N->getValueType(0);
  unsigned MLAOpcode;
  SDValue LHS, RHS;
  if (!DetectExtendingMultiply(N->getOperand(0), ResultTy, MLAOpcode, LHS,
                               RHS))
    return SDValue();

  SDLoc DL(N);
  // A full reduction produces a single element, hence AccNumElements == 1.
  // The remaining ratio is not needed here; the residual reduction is simply
  // another full VECREDUCE_ADD.
  unsigned RemainingRatio;
  SDValue Inner = createExtendingPartialReduceMLA(
      MLAOpcode, ResultTy, /*AccNumElements=*/1, LHS.getValueType(), LHS, RHS,
      RemainingRatio, DL, DAG);
  if (!Inner)
    return SDValue();

  // A trivial MLA could have been inserted here, relying on the folding
  // action (similar to how vector_partial_reduce_add becomes an MLA in
  // SelectionDAGBuilder). Since the input has been analyzed completely, the
  // final result is replaced directly instead.
  return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultTy, Inner);
}
|
||||
|
||||
// When possible, separate an MLA reduction with extended operands but
|
||||
// unsupported reduction factor into an extending partial reduction that
|
||||
// can be efficiently lowered, and a follow-up partial reduction.
|
||||
// partial_reduce_mla(a, x, y) ->
|
||||
// partial_reduce_mla(a, partial_reduce_mla(0, x, y), 1)
|
||||
SDValue
|
||||
HexagonTargetLowering::splitExtendingPartialReduceMLA(SDNode *N,
|
||||
SelectionDAG &DAG) const {
|
||||
if (!Subtarget.useHVXOps())
|
||||
return SDValue();
|
||||
|
||||
SDValue Acc = N->getOperand(0);
|
||||
SDValue A = N->getOperand(1);
|
||||
SDValue B = N->getOperand(2);
|
||||
if (A.getValueType() != B.getValueType())
|
||||
return SDValue();
|
||||
|
||||
// The types should be declared as custom, but do not split already legal
|
||||
// operation.
|
||||
EVT AccType = Acc.getValueType();
|
||||
EVT InputType = A.getValueType();
|
||||
if (getPartialReduceMLAAction(N->getOpcode(), AccType, InputType) != Custom)
|
||||
return SDValue();
|
||||
|
||||
SDLoc DL(N);
|
||||
unsigned RemainingReductionRatio;
|
||||
SDValue Partial = createExtendingPartialReduceMLA(
|
||||
N->getOpcode(), AccType.getVectorElementType(),
|
||||
AccType.getVectorNumElements(), InputType, A, B, RemainingReductionRatio,
|
||||
DL, DAG);
|
||||
if (!Partial)
|
||||
return SDValue();
|
||||
assert(RemainingReductionRatio <= MaxExpandMLA);
|
||||
|
||||
// Create the reduction for the remaining ratio.
|
||||
EVT IntermediateType = Partial->getOperand(0).getValueType();
|
||||
SDValue One = DAG.getConstant(1, DL, IntermediateType);
|
||||
return DAG.getNode(N->getOpcode() == ISD::PARTIAL_REDUCE_UMLA
|
||||
? ISD::PARTIAL_REDUCE_UMLA
|
||||
: ISD::PARTIAL_REDUCE_SUMLA,
|
||||
DL, AccType, Acc, Partial, One);
|
||||
}
|
||||
|
||||
/// Custom-lower a partial-reduction MLA whose operands span several HVX
/// vectors: the accumulator and both inputs are cut into pieces of one HVX
/// vector each, one MLA of the same opcode is emitted per piece, and the
/// per-piece results are concatenated back into the accumulator type.
///
/// The accumulator and the inputs must have the same total bit width, which
/// must be a multiple of the HVX vector width; piece I of the accumulator is
/// paired with piece I of each input.
SDValue
HexagonTargetLowering::LowerHvxPartialReduceMLA(SDValue Op,
                                                SelectionDAG &DAG) const {
  const SDLoc &DL(Op);
  SDValue Acc = Op.getOperand(0);
  SDValue A = Op.getOperand(1);
  SDValue B = Op.getOperand(2);

  // Split the input vectors into units of one HVX vector length.
  unsigned HwVectorSizeInBits = Subtarget.getVectorLength() * 8;

  EVT AccType = Acc.getValueType();
  EVT AccEltType = AccType.getVectorElementType();
  unsigned AccSubvectorNumElements =
      HwVectorSizeInBits / AccEltType.getSizeInBits();
  EVT AccSubvectorType =
      EVT::getVectorVT(*DAG.getContext(), AccEltType, AccSubvectorNumElements);

  EVT InputType = A.getValueType();
  assert(InputType.getSizeInBits() % HwVectorSizeInBits == 0);
  // The loop below reads one accumulator piece per input piece and the pieces
  // are concatenated into AccType, so both sides must cover the same number
  // of HVX vectors.
  assert(AccType.getSizeInBits() % HwVectorSizeInBits == 0 &&
         "Accumulator must be a whole number of HVX vectors");
  assert(AccType.getSizeInBits() == InputType.getSizeInBits() &&
         "Accumulator and input must span the same number of HVX vectors");
  EVT InputEltType = InputType.getVectorElementType();
  unsigned InputSubvectorNumElements =
      HwVectorSizeInBits / InputEltType.getSizeInBits();
  EVT InputSubvectorType = EVT::getVectorVT(*DAG.getContext(), InputEltType,
                                            InputSubvectorNumElements);

  unsigned SubvectorNum = InputType.getFixedSizeInBits() / HwVectorSizeInBits;
  SmallVector<SDValue, MaxExpandMLA> Subvectors;

  for (unsigned I = 0; I != SubvectorNum; ++I) {
    // Matching slices of the accumulator and of both multiplicands.
    SDValue SubvectorAcc = DAG.getExtractSubvector(DL, AccSubvectorType, Acc,
                                                   I * AccSubvectorNumElements);
    SDValue SubvectorA = DAG.getExtractSubvector(DL, InputSubvectorType, A,
                                                 I * InputSubvectorNumElements);
    SDValue SubvectorB = DAG.getExtractSubvector(DL, InputSubvectorType, B,
                                                 I * InputSubvectorNumElements);
    SDValue SubvectorMLA = DAG.getNode(Op.getOpcode(), DL, AccSubvectorType,
                                       SubvectorAcc, SubvectorA, SubvectorB);
    Subvectors.push_back(SubvectorMLA);
  }

  return DAG.getNode(ISD::CONCAT_VECTORS, DL, AccType, Subvectors);
}
|
||||
|
||||
SDValue
|
||||
HexagonTargetLowering::PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
|
||||
const {
|
||||
@ -4039,43 +4301,33 @@ HexagonTargetLowering::PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
|
||||
return SDValue();
|
||||
|
||||
switch (Opc) {
|
||||
case ISD::VSELECT: {
|
||||
// (vselect (xor x, qtrue), v0, v1) -> (vselect x, v1, v0)
|
||||
SDValue Cond = Ops[0];
|
||||
if (Cond->getOpcode() == ISD::XOR) {
|
||||
SDValue C0 = Cond.getOperand(0), C1 = Cond.getOperand(1);
|
||||
if (C1->getOpcode() == HexagonISD::QTRUE)
|
||||
return DAG.getNode(ISD::VSELECT, dl, ty(Op), C0, Ops[2], Ops[1]);
|
||||
}
|
||||
break;
|
||||
case HexagonISD::V2Q:
|
||||
if (Ops[0].getOpcode() == ISD::SPLAT_VECTOR) {
|
||||
if (const auto *C = dyn_cast<ConstantSDNode>(Ops[0].getOperand(0)))
|
||||
return C->isZero() ? DAG.getNode(HexagonISD::QFALSE, dl, ty(Op))
|
||||
: DAG.getNode(HexagonISD::QTRUE, dl, ty(Op));
|
||||
}
|
||||
case HexagonISD::V2Q:
|
||||
if (Ops[0].getOpcode() == ISD::SPLAT_VECTOR) {
|
||||
if (const auto *C = dyn_cast<ConstantSDNode>(Ops[0].getOperand(0)))
|
||||
return C->isZero() ? DAG.getNode(HexagonISD::QFALSE, dl, ty(Op))
|
||||
: DAG.getNode(HexagonISD::QTRUE, dl, ty(Op));
|
||||
}
|
||||
break;
|
||||
case HexagonISD::Q2V:
|
||||
if (Ops[0].getOpcode() == HexagonISD::QTRUE)
|
||||
return DAG.getNode(ISD::SPLAT_VECTOR, dl, ty(Op),
|
||||
DAG.getAllOnesConstant(dl, MVT::i32));
|
||||
if (Ops[0].getOpcode() == HexagonISD::QFALSE)
|
||||
return getZero(dl, ty(Op), DAG);
|
||||
break;
|
||||
case HexagonISD::VINSERTW0:
|
||||
if (isUndef(Ops[1]))
|
||||
return Ops[0];
|
||||
break;
|
||||
case HexagonISD::VROR: {
|
||||
if (Ops[0].getOpcode() == HexagonISD::VROR) {
|
||||
SDValue Vec = Ops[0].getOperand(0);
|
||||
SDValue Rot0 = Ops[1], Rot1 = Ops[0].getOperand(1);
|
||||
SDValue Rot = DAG.getNode(ISD::ADD, dl, ty(Rot0), {Rot0, Rot1});
|
||||
return DAG.getNode(HexagonISD::VROR, dl, ty(Op), {Vec, Rot});
|
||||
}
|
||||
break;
|
||||
break;
|
||||
case HexagonISD::Q2V:
|
||||
if (Ops[0].getOpcode() == HexagonISD::QTRUE)
|
||||
return DAG.getNode(ISD::SPLAT_VECTOR, dl, ty(Op),
|
||||
DAG.getAllOnesConstant(dl, MVT::i32));
|
||||
if (Ops[0].getOpcode() == HexagonISD::QFALSE)
|
||||
return getZero(dl, ty(Op), DAG);
|
||||
break;
|
||||
case HexagonISD::VINSERTW0:
|
||||
if (isUndef(Ops[1]))
|
||||
return Ops[0];
|
||||
break;
|
||||
case HexagonISD::VROR: {
|
||||
if (Ops[0].getOpcode() == HexagonISD::VROR) {
|
||||
SDValue Vec = Ops[0].getOperand(0);
|
||||
SDValue Rot0 = Ops[1], Rot1 = Ops[0].getOperand(1);
|
||||
SDValue Rot = DAG.getNode(ISD::ADD, dl, ty(Rot0), {Rot0, Rot1});
|
||||
return DAG.getNode(HexagonISD::VROR, dl, ty(Op), {Vec, Rot});
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return SDValue();
|
||||
|
||||
@ -456,6 +456,13 @@ let Predicates = [UseHVX] in {
|
||||
(VShuff (V6_vmpyhus_acc (VDeal $Vx, -4),
|
||||
HVI16:$Vs, HVI16:$Vt), -4)>;
|
||||
}
|
||||
|
||||
def : Pat<(VecI32 (partial_reduce_umla VecI32:$Acc, HVI8:$A, HVI8:$B)),
|
||||
(V6_vrmpyubv_acc $Acc, $A, $B)>;
|
||||
def : Pat<(VecI32 (partial_reduce_smla VecI32:$Acc, HVI8:$A, HVI8:$B)),
|
||||
(V6_vrmpybv_acc $Acc, $A, $B)>;
|
||||
def : Pat<(VecI32 (partial_reduce_sumla VecI32:$Acc, HVI8:$A, HVI8:$B)),
|
||||
(V6_vrmpybusv_acc $Acc, $B, $A)>;
|
||||
}
|
||||
|
||||
let Predicates = [UseHVX] in {
|
||||
|
||||
@ -327,6 +327,14 @@ InstructionCost HexagonTTIImpl::getVectorInstrCost(
|
||||
return 1;
|
||||
}
|
||||
|
||||
bool HexagonTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
|
||||
switch (II->getIntrinsicID()) {
|
||||
case Intrinsic::vector_reduce_add:
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool HexagonTTIImpl::isLegalMaskedStore(Type *DataType, Align /*Alignment*/,
|
||||
unsigned /*AddressSpace*/,
|
||||
TTI::MaskKind /*MaskKind*/) const {
|
||||
|
||||
@ -156,7 +156,7 @@ public:
|
||||
const Instruction *I = nullptr) const override {
|
||||
return 1;
|
||||
}
|
||||
|
||||
bool shouldExpandReduction(const IntrinsicInst *II) const override;
|
||||
bool isLegalMaskedStore(Type *DataType, Align Alignment,
|
||||
unsigned AddressSpace,
|
||||
TTI::MaskKind MaskKind) const override;
|
||||
|
||||
143
llvm/test/CodeGen/Hexagon/expand-vecreduce-add.ll
Normal file
143
llvm/test/CodeGen/Hexagon/expand-vecreduce-add.ll
Normal file
@ -0,0 +1,143 @@
|
||||
; RUN: llc -mtriple=hexagon < %s | FileCheck %s
|
||||
|
||||
target triple = "hexagon"
|
||||
|
||||
define i32 @add_v32i32(<32 x i32> %vec) #0 {
|
||||
; CHECK-LABEL: add_v32i32:
|
||||
; CHECK: {
|
||||
; CHECK: [[R0:v[0-9]+]] = valign([[_:v[0-9]+]],v0,{{.+}})
|
||||
; CHECK: }
|
||||
; CHECK: {
|
||||
; CHECK: [[R1:v[0-9]+]].w = vadd(v0.w,[[R0]].w)
|
||||
; CHECK: }
|
||||
; CHECK: {
|
||||
; CHECK: [[R2:v[0-9]+]] = valign([[_:v[0-9]+]],[[R1]],{{.+}})
|
||||
; CHECK: }
|
||||
; CHECK: {
|
||||
; CHECK: [[R3:v[0-9]+]].w = vadd([[R1]].w,[[R2]].w)
|
||||
; CHECK: }
|
||||
; CHECK: {
|
||||
; CHECK: [[R4:v[0-9]+]] = valign([[_:v[0-9]+]],[[R3]],{{.+}})
|
||||
; CHECK: }
|
||||
; CHECK: {
|
||||
; CHECK: [[R5:v[0-9]+]].w = vadd([[R3]].w,[[R4]].w)
|
||||
; CHECK: }
|
||||
; CHECK: {
|
||||
; CHECK: [[R6:v[0-9]+]] = valign([[_:v[0-9]+]],[[R5]],{{.+}})
|
||||
; CHECK: }
|
||||
; CHECK: {
|
||||
; CHECK: [[R7:v[0-9]+]].w = vadd([[R5]].w,[[R6]].w)
|
||||
; CHECK: }
|
||||
; CHECK: {
|
||||
; CHECK: [[R8:v[0-9]+]] = valign([[_:v[0-9]+]],[[R7]],{{.+}})
|
||||
; CHECK: }
|
||||
; CHECK: {
|
||||
; CHECK: [[R9:v[0-9]+]].w = vadd([[R7]].w,[[R8]].w)
|
||||
; CHECK: }
|
||||
; CHECK: {
|
||||
; CHECK: r0 = vextract([[R9]],{{.+}})
|
||||
; CHECK: }
|
||||
entry:
|
||||
%r = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %vec)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @add_v16i32(<16 x i32> %vec) #0 {
|
||||
; CHECK-LABEL: add_v16i32:
|
||||
; CHECK: {
|
||||
; CHECK: [[R0:v[0-9]+]] = valign([[_:v[0-9]+]],v0,{{.+}})
|
||||
; CHECK: }
|
||||
; CHECK: {
|
||||
; CHECK: [[R1:v[0-9]+]].w = vadd(v0.w,[[R0]].w)
|
||||
; CHECK: }
|
||||
; CHECK: {
|
||||
; CHECK: [[R2:v[0-9]+]] = valign([[_:v[0-9]+]],[[R1]],{{.+}})
|
||||
; CHECK: }
|
||||
; CHECK: {
|
||||
; CHECK: [[R3:v[0-9]+]].w = vadd([[R1]].w,[[R2]].w)
|
||||
; CHECK: }
|
||||
; CHECK: {
|
||||
; CHECK: [[R4:v[0-9]+]] = valign([[_:v[0-9]+]],[[R3]],{{.+}})
|
||||
; CHECK: }
|
||||
; CHECK: {
|
||||
; CHECK: [[R5:v[0-9]+]].w = vadd([[R3]].w,[[R4]].w)
|
||||
; CHECK: }
|
||||
; CHECK: {
|
||||
; CHECK: [[R6:v[0-9]+]] = valign([[_:v[0-9]+]],[[R5]],{{.+}})
|
||||
; CHECK: }
|
||||
; CHECK: {
|
||||
; CHECK: [[R7:v[0-9]+]].w = vadd([[R5]].w,[[R6]].w)
|
||||
; CHECK: }
|
||||
; CHECK: {
|
||||
; CHECK: r0 = vextract([[R7]],{{.+}})
|
||||
; CHECK: }
|
||||
entry:
|
||||
%r = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %vec)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @add_v8i32(<8 x i32> %vec) #0 {
|
||||
; CHECK-LABEL: add_v8i32:
|
||||
; CHECK: {
|
||||
; CHECK: r[[RS1:[0-9]+:[0-9]+]] = vaddw(r1:0,r5:4)
|
||||
; CHECK: r[[R6:[0-9]+:[0-9]+]] = memd(r29+#0)
|
||||
; CHECK: }
|
||||
; CHECK: {
|
||||
; CHECK: r[[RS2:[0-9]+:[0-9]+]] = vaddw(r3:2,r[[R6]])
|
||||
; CHECK: }
|
||||
; CHECK: {
|
||||
; CHECK: r[[RS3:[0-9]+:[0-9]+]] = vaddw(r[[RS1]],r[[RS2]])
|
||||
; CHECK: }
|
||||
; CHECK: {
|
||||
;; TODO: combine and double register add can be optimized to single register add.
|
||||
; CHECK: r[[RS4:[0-9]+:[0-9]+]] = combine(#0,r{{[0-9]+}})
|
||||
; CHECK: }
|
||||
; CHECK: {
|
||||
; CHECK: r1:0 = vaddw(r[[RS3]],r[[RS4]])
|
||||
entry:
|
||||
%r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %vec)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @add_v64i32(<64 x i32> %vec) #0 {
|
||||
; CHECK-LABEL: add_v64i32:
|
||||
; CHECK: vadd
|
||||
; CHECK: valign
|
||||
; CHECK: vadd
|
||||
; CHECK: valign
|
||||
; CHECK: vadd
|
||||
; CHECK: valign
|
||||
; CHECK: vadd
|
||||
; CHECK: valign
|
||||
; CHECK: vadd
|
||||
; CHECK: valign
|
||||
; CHECK: vadd
|
||||
entry:
|
||||
%r = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %vec)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
;; Non-pow2 vectors are scalarized.
|
||||
|
||||
define i32 @add_v12i32(<12 x i32> %vec) #0 {
|
||||
; CHECK-LABEL: add_v12i32:
|
||||
; CHECK: [[RS0:r[0-9]+]] = add(r0,r1)
|
||||
; CHECK: [[RS1:r[0-9]+]] += add([[RS0]],r{{[0-9]+}})
|
||||
; CHECK: [[RS2:r[0-9]+]] += add([[RS1]],r{{[0-9]+}})
|
||||
; CHECK: [[RS3:r[0-9]+]] += add([[RS2]],r{{[0-9]+}})
|
||||
; CHECK: [[RS4:r[0-9]+]] += add([[RS3]],r{{[0-9]+}})
|
||||
; CHECK: [[RS5:r[0-9]+]] += add([[RS4]],r{{[0-9]+}})
|
||||
entry:
|
||||
%r = call i32 @llvm.vector.reduce.add.v12i32(<12 x i32> %vec)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
define i32 @add_v3i32(<3 x i32> %vec) #0 {
|
||||
; CHECK-LABEL: add_v3i32:
|
||||
; CHECK: r{{[0-9]+}} += add(r{{[0-9]+}},r{{[0-9]+}})
|
||||
entry:
|
||||
%r = call i32 @llvm.vector.reduce.add.v3i32(<3 x i32> %vec)
|
||||
ret i32 %r
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind readnone "target-cpu"="hexagonv68" "target-features"="+hvx,+hvx-length128b" }
|
||||
145
llvm/test/CodeGen/Hexagon/hvx-full-reduce.ll
Normal file
145
llvm/test/CodeGen/Hexagon/hvx-full-reduce.ll
Normal file
@ -0,0 +1,145 @@
|
||||
; RUN: llc -mtriple=hexagon < %s | FileCheck %s
|
||||
|
||||
;; Full i32 reduction of the product of two zero-extended <128 x i8> inputs.
;; Expected code: a zeroed accumulator (vxor of a register with itself), one
;; unsigned-by-unsigned accumulating vrmpy, five valign + vadd steps, and a
;; final vextract.
define i32 @full_reduce_i32_128i8_uu(<128 x i8> %x, <128 x i8> %y) #0 {
; CHECK-LABEL: full_reduce_i32_128i8_uu:
; CHECK: [[A:v[0-9]+]] = vxor([[Z:v[0-9]+]],[[Z]])
; CHECK: [[A]].uw += vrmpy(v0.ub,v1.ub)
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: vextract
  %lhs = zext <128 x i8> %x to <128 x i32>
  %rhs = zext <128 x i8> %y to <128 x i32>
  %prod = mul nuw nsw <128 x i32> %lhs, %rhs
  %sum = tail call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> %prod)
  ret i32 %sum
}
|
||||
|
||||
;; Full i32 reduction of sext(%x) * zext(%y) over <128 x i8> inputs.
;; The expected vrmpy names the unsigned (.ub) operand first and the signed
;; (.b) operand second, so the registers appear as v1, v0.
define i32 @full_reduce_i32_128i8_su(<128 x i8> %x, <128 x i8> %y) #0 {
; CHECK-LABEL: full_reduce_i32_128i8_su:
; CHECK: [[A:v[0-9]+]] = vxor([[Z:v[0-9]+]],[[Z]])
; CHECK: [[A]].w += vrmpy(v1.ub,v0.b)
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: vextract
  %lhs = sext <128 x i8> %x to <128 x i32>
  %rhs = zext <128 x i8> %y to <128 x i32>
  %prod = mul nuw nsw <128 x i32> %lhs, %rhs
  %sum = tail call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> %prod)
  ret i32 %sum
}
|
||||
|
||||
;; Full i32 reduction of zext(%x) * sext(%y) over <128 x i8> inputs.
;; Expected code: zeroed accumulator, one unsigned-by-signed accumulating
;; vrmpy, five valign + vadd steps, then vextract.
define i32 @full_reduce_i32_128i8_us(<128 x i8> %x, <128 x i8> %y) #0 {
; CHECK-LABEL: full_reduce_i32_128i8_us:
; CHECK: [[A:v[0-9]+]] = vxor([[Z:v[0-9]+]],[[Z]])
; CHECK: [[A]].w += vrmpy(v0.ub,v1.b)
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: vextract
  %lhs = zext <128 x i8> %x to <128 x i32>
  %rhs = sext <128 x i8> %y to <128 x i32>
  %prod = mul nuw nsw <128 x i32> %lhs, %rhs
  %sum = tail call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> %prod)
  ret i32 %sum
}
|
||||
|
||||
;; Full i32 reduction of the product of two sign-extended <128 x i8> inputs.
;; Expected code: zeroed accumulator, one signed-by-signed accumulating
;; vrmpy, five valign + vadd steps, then vextract.
define i32 @full_reduce_i32_128i8_ss(<128 x i8> %x, <128 x i8> %y) #0 {
; CHECK-LABEL: full_reduce_i32_128i8_ss:
; CHECK: [[A:v[0-9]+]] = vxor([[Z:v[0-9]+]],[[Z]])
; CHECK: [[A]].w += vrmpy(v0.b,v1.b)
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: vextract
  %lhs = sext <128 x i8> %x to <128 x i32>
  %rhs = sext <128 x i8> %y to <128 x i32>
  %prod = mul nuw nsw <128 x i32> %lhs, %rhs
  %sum = tail call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> %prod)
  ret i32 %sum
}
|
||||
|
||||
;; Double-vector input.
|
||||
|
||||
;; Full i32 reduction of the product of two zero-extended <256 x i8> inputs.
;; Expected code: two vrmpy instructions, a vadd, then five valign + vadd
;; steps.
define i32 @full_reduce_i32_256i8(<256 x i8> %x, <256 x i8> %y) #0 {
; CHECK-LABEL: full_reduce_i32_256i8:
; CHECK: vrmpy
; CHECK: vrmpy
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
  %lhs = zext <256 x i8> %x to <256 x i32>
  %rhs = zext <256 x i8> %y to <256 x i32>
  %prod = mul nuw nsw <256 x i32> %lhs, %rhs
  %sum = tail call i32 @llvm.vector.reduce.add.v256i32(<256 x i32> %prod)
  ret i32 %sum
}
|
||||
|
||||
;; Maximum handled vector size.
|
||||
|
||||
;; Full i32 reduction of the product of two zero-extended <1024 x i8> inputs.
;; Expected code: eight vrmpy instructions, a vadd, then five valign + vadd
;; steps.
define i32 @full_reduce_i32_1024i8(<1024 x i8> %x, <1024 x i8> %y) #0 {
; CHECK-LABEL: full_reduce_i32_1024i8:
; CHECK: vrmpy
; CHECK: vrmpy
; CHECK: vrmpy
; CHECK: vrmpy
; CHECK: vrmpy
; CHECK: vrmpy
; CHECK: vrmpy
; CHECK: vrmpy
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
; CHECK: valign
; CHECK: vadd
  %lhs = zext <1024 x i8> %x to <1024 x i32>
  %rhs = zext <1024 x i8> %y to <1024 x i32>
  %prod = mul nuw nsw <1024 x i32> %lhs, %rhs
  %sum = tail call i32 @llvm.vector.reduce.add.v1024i32(<1024 x i32> %prod)
  ret i32 %sum
}
|
||||
|
||||
;; Attribute group #0: target CPU hexagonv68 with HVX enabled and a 128-byte
;; HVX vector length.
attributes #0 = { nounwind readnone "target-cpu"="hexagonv68" "target-features"="+hvx,+hvx-length128b" }
|
||||
162
llvm/test/CodeGen/Hexagon/hvx-partial-reduce.ll
Normal file
162
llvm/test/CodeGen/Hexagon/hvx-partial-reduce.ll
Normal file
@ -0,0 +1,162 @@
|
||||
;; Check HVX vectorization.
|
||||
; RUN: llc -mtriple hexagon < %s | FileCheck %s --check-prefixes=CHECK,CHECK-HVX
|
||||
|
||||
;; Check that compiling to scalar code (HVX disabled) does not fail; only the presence of a scalar multiply-accumulate is checked.
|
||||
; RUN: llc -mtriple hexagon -mattr=-hvx,-hvxv73,-hvx-length128b < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NO-HVX
|
||||
|
||||
;; Partial reduction of zext(%x) * zext(%y) (<64 x i8> inputs) into a
;; <16 x i32> accumulator, using attribute group #0.
;; With HVX: a single unsigned-by-unsigned accumulating vrmpy into v0.
;; Without HVX: only a scalar accumulating mpyi is matched.
;; The intrinsic suffix matches the actual result and operand types
;; (<16 x i32> and <64 x i32>).
define <16 x i32> @partial_reduce_uu_64(<16 x i32> %acc, <64 x i8> %x, <64 x i8> %y) #0 {
; CHECK-LABEL: partial_reduce_uu_64:
; CHECK-HVX: v0.uw += vrmpy(v1.ub,v2.ub)
; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
  %x.ext = zext <64 x i8> %x to <64 x i32>
  %y.ext = zext <64 x i8> %y to <64 x i32>
  %m = mul nuw nsw <64 x i32> %x.ext, %y.ext
  %partial.reduce = tail call <16 x i32> @llvm.vector.partial.reduce.add.v16i32.v64i32(<16 x i32> %acc, <64 x i32> %m)
  ret <16 x i32> %partial.reduce
}
|
||||
|
||||
;; Partial reduction of sext(%x) * zext(%y) (<64 x i8> inputs) into a
;; <16 x i32> accumulator, using attribute group #0.
;; With HVX: the expected vrmpy names the unsigned (.ub) operand first and
;; the signed (.b) operand second, so the registers appear as v2, v1.
;; Without HVX: only a scalar accumulating mpyi is matched.
;; The intrinsic suffix matches the actual result and operand types
;; (<16 x i32> and <64 x i32>).
define <16 x i32> @partial_reduce_su_64(<16 x i32> %acc, <64 x i8> %x, <64 x i8> %y) #0 {
; CHECK-LABEL: partial_reduce_su_64:
; CHECK-HVX: v0.w += vrmpy(v2.ub,v1.b)
; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
  %x.ext = sext <64 x i8> %x to <64 x i32>
  %y.ext = zext <64 x i8> %y to <64 x i32>
  %m = mul nuw nsw <64 x i32> %x.ext, %y.ext
  %partial.reduce = tail call <16 x i32> @llvm.vector.partial.reduce.add.v16i32.v64i32(<16 x i32> %acc, <64 x i32> %m)
  ret <16 x i32> %partial.reduce
}
|
||||
|
||||
;; Partial reduction of zext(%x) * sext(%y) (<64 x i8> inputs) into a
;; <16 x i32> accumulator, using attribute group #0.
;; With HVX: a single unsigned-by-signed accumulating vrmpy into v0.
;; Without HVX: only a scalar accumulating mpyi is matched.
;; The intrinsic suffix matches the actual result and operand types
;; (<16 x i32> and <64 x i32>).
define <16 x i32> @partial_reduce_us_64(<16 x i32> %acc, <64 x i8> %x, <64 x i8> %y) #0 {
; CHECK-LABEL: partial_reduce_us_64:
; CHECK-HVX: v0.w += vrmpy(v1.ub,v2.b)
; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
  %x.ext = zext <64 x i8> %x to <64 x i32>
  %y.ext = sext <64 x i8> %y to <64 x i32>
  %m = mul nuw nsw <64 x i32> %x.ext, %y.ext
  %partial.reduce = tail call <16 x i32> @llvm.vector.partial.reduce.add.v16i32.v64i32(<16 x i32> %acc, <64 x i32> %m)
  ret <16 x i32> %partial.reduce
}
|
||||
|
||||
;; Partial reduction of sext(%x) * sext(%y) (<64 x i8> inputs) into a
;; <16 x i32> accumulator, using attribute group #0.
;; With HVX: a single signed-by-signed accumulating vrmpy into v0.
;; Without HVX: only a scalar accumulating mpyi is matched.
;; The intrinsic suffix matches the actual result and operand types
;; (<16 x i32> and <64 x i32>).
define <16 x i32> @partial_reduce_ss_64(<16 x i32> %acc, <64 x i8> %x, <64 x i8> %y) #0 {
; CHECK-LABEL: partial_reduce_ss_64:
; CHECK-HVX: v0.w += vrmpy(v1.b,v2.b)
; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
  %x.ext = sext <64 x i8> %x to <64 x i32>
  %y.ext = sext <64 x i8> %y to <64 x i32>
  %m = mul nuw nsw <64 x i32> %x.ext, %y.ext
  %partial.reduce = tail call <16 x i32> @llvm.vector.partial.reduce.add.v16i32.v64i32(<16 x i32> %acc, <64 x i32> %m)
  ret <16 x i32> %partial.reduce
}
|
||||
|
||||
;; Partial reduction of zext(%x) * zext(%y) (<128 x i8> inputs) into a
;; <32 x i32> accumulator, using attribute group #1.
;; With HVX: a single unsigned-by-unsigned accumulating vrmpy into v0.
;; Without HVX: only a scalar accumulating mpyi is matched.
define <32 x i32> @partial_reduce_uu_128(<32 x i32> %acc, <128 x i8> %x, <128 x i8> %y) #1 {
; CHECK-LABEL: partial_reduce_uu_128:
; CHECK-HVX: v0.uw += vrmpy(v1.ub,v2.ub)
; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
  %lhs = zext <128 x i8> %x to <128 x i32>
  %rhs = zext <128 x i8> %y to <128 x i32>
  %prod = mul nuw nsw <128 x i32> %lhs, %rhs
  %acc.next = tail call <32 x i32> @llvm.vector.partial.reduce.add.v32i32.v128i32(<32 x i32> %acc, <128 x i32> %prod)
  ret <32 x i32> %acc.next
}
|
||||
|
||||
;; Partial reduction of sext(%x) * zext(%y) (<128 x i8> inputs) into a
;; <32 x i32> accumulator, using attribute group #1.
;; With HVX: the expected vrmpy names the unsigned (.ub) operand first and
;; the signed (.b) operand second, so the registers appear as v2, v1.
;; Without HVX: only a scalar accumulating mpyi is matched.
define <32 x i32> @partial_reduce_su_128(<32 x i32> %acc, <128 x i8> %x, <128 x i8> %y) #1 {
; CHECK-LABEL: partial_reduce_su_128:
; CHECK-HVX: v0.w += vrmpy(v2.ub,v1.b)
; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
  %lhs = sext <128 x i8> %x to <128 x i32>
  %rhs = zext <128 x i8> %y to <128 x i32>
  %prod = mul nuw nsw <128 x i32> %lhs, %rhs
  %acc.next = tail call <32 x i32> @llvm.vector.partial.reduce.add.v32i32.v128i32(<32 x i32> %acc, <128 x i32> %prod)
  ret <32 x i32> %acc.next
}
|
||||
|
||||
;; Partial reduction of zext(%x) * sext(%y) (<128 x i8> inputs) into a
;; <32 x i32> accumulator, using attribute group #1.
;; With HVX: a single unsigned-by-signed accumulating vrmpy into v0.
;; Without HVX: only a scalar accumulating mpyi is matched.
define <32 x i32> @partial_reduce_us_128(<32 x i32> %acc, <128 x i8> %x, <128 x i8> %y) #1 {
; CHECK-LABEL: partial_reduce_us_128:
; CHECK-HVX: v0.w += vrmpy(v1.ub,v2.b)
; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
  %lhs = zext <128 x i8> %x to <128 x i32>
  %rhs = sext <128 x i8> %y to <128 x i32>
  %prod = mul nuw nsw <128 x i32> %lhs, %rhs
  %acc.next = tail call <32 x i32> @llvm.vector.partial.reduce.add.v32i32.v128i32(<32 x i32> %acc, <128 x i32> %prod)
  ret <32 x i32> %acc.next
}
|
||||
|
||||
;; Partial reduction of sext(%x) * sext(%y) (<128 x i8> inputs) into a
;; <32 x i32> accumulator, using attribute group #1.
;; With HVX: a single signed-by-signed accumulating vrmpy into v0.
;; Without HVX: only a scalar accumulating mpyi is matched.
define <32 x i32> @partial_reduce_ss_128(<32 x i32> %acc, <128 x i8> %x, <128 x i8> %y) #1 {
; CHECK-LABEL: partial_reduce_ss_128:
; CHECK-HVX: v0.w += vrmpy(v1.b,v2.b)
; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
  %lhs = sext <128 x i8> %x to <128 x i32>
  %rhs = sext <128 x i8> %y to <128 x i32>
  %prod = mul nuw nsw <128 x i32> %lhs, %rhs
  %acc.next = tail call <32 x i32> @llvm.vector.partial.reduce.add.v32i32.v128i32(<32 x i32> %acc, <128 x i32> %prod)
  ret <32 x i32> %acc.next
}
|
||||
|
||||
;; Multiple-size inputs, same output size.
|
||||
;; Partial reduction of zext(%x) * zext(%y) (<256 x i8> inputs) into a
;; <32 x i32> accumulator.
;; With HVX: two accumulating vrmpy results are expected, combined with the
;; incoming accumulator (v0) by two vadd instructions, the last writing v0.
;; Without HVX: only a scalar accumulating mpyi is matched.
define <32 x i32> @partial_reduce_uu_32xi32_256xi8(<32 x i32> %acc, <256 x i8> %x, <256 x i8> %y) #1 {
; CHECK-LABEL: partial_reduce_uu_32xi32_256xi8:
; CHECK-HVX: [[R1:v[0-9]+]].uw += vrmpy({{v[0-9]+}}.ub,{{v[0-9]+}}.ub)
; CHECK-HVX: [[R2:v[0-9]+]].uw += vrmpy({{v[0-9]+}}.ub,{{v[0-9]+}}.ub)
; CHECK-HVX: [[R3:v[0-9]+]].w = vadd(v0.w,[[R1]].w)
; CHECK-HVX: v0.w = vadd([[R2]].w,[[R3]].w)
; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
  %lhs = zext <256 x i8> %x to <256 x i32>
  %rhs = zext <256 x i8> %y to <256 x i32>
  %prod = mul nuw nsw <256 x i32> %lhs, %rhs
  %acc.next = tail call <32 x i32> @llvm.vector.partial.reduce.add.v32i32.v256i32(<32 x i32> %acc, <256 x i32> %prod)
  ret <32 x i32> %acc.next
}
|
||||
|
||||
;; Partial reduction of zext(%x) * zext(%y) (<1024 x i8> inputs) into a
;; <32 x i32> accumulator.
;; With HVX: eight vrmpy and eight vadd instructions are expected, matched
;; in any order (DAG-style checks).
;; Without HVX: only a scalar accumulating mpyi is matched.
define <32 x i32> @partial_reduce_uu_32xi32_1024xi8(<32 x i32> %acc, <1024 x i8> %x, <1024 x i8> %y) #1 {
; CHECK-LABEL: partial_reduce_uu_32xi32_1024xi8:
; CHECK-HVX-DAG: vrmpy
; CHECK-HVX-DAG: vadd
; CHECK-HVX-DAG: vrmpy
; CHECK-HVX-DAG: vadd
; CHECK-HVX-DAG: vrmpy
; CHECK-HVX-DAG: vadd
; CHECK-HVX-DAG: vrmpy
; CHECK-HVX-DAG: vadd
; CHECK-HVX-DAG: vrmpy
; CHECK-HVX-DAG: vadd
; CHECK-HVX-DAG: vrmpy
; CHECK-HVX-DAG: vadd
; CHECK-HVX-DAG: vrmpy
; CHECK-HVX-DAG: vadd
; CHECK-HVX-DAG: vrmpy
; CHECK-HVX-DAG: vadd
; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
  %lhs = zext <1024 x i8> %x to <1024 x i32>
  %rhs = zext <1024 x i8> %y to <1024 x i32>
  %prod = mul nuw nsw <1024 x i32> %lhs, %rhs
  %acc.next = tail call <32 x i32> @llvm.vector.partial.reduce.add.v32i32.v1024i32(<32 x i32> %acc, <1024 x i32> %prod)
  ret <32 x i32> %acc.next
}
|
||||
|
||||
;; Partial reduction of zext(%x) * zext(%y) (<1024 x i8> inputs) into a
;; <256 x i32> accumulator.
;; With HVX: exactly eight vrmpy instructions and no vadd before the
;; function returns.
;; Without HVX: only a scalar accumulating mpyi is matched.
;; NOTE(review): the function name says "64xi32" but the accumulator type is
;; <256 x i32>; the name is kept as-is, confirm whether it should be renamed.
;; The intrinsic suffix matches the actual result and operand types
;; (<256 x i32> and <1024 x i32>).
define <256 x i32> @partial_reduce_uu_64xi32_1024xi8(<256 x i32> %acc, <1024 x i8> %x, <1024 x i8> %y) #1 {
; CHECK-LABEL: partial_reduce_uu_64xi32_1024xi8:
; CHECK-HVX-COUNT-8: vrmpy
; CHECK-HVX-NOT: vadd
; CHECK-NO-HVX: {{r[0-9]+}} += mpyi
; CHECK-HVX: dealloc_return
  %x.ext = zext <1024 x i8> %x to <1024 x i32>
  %y.ext = zext <1024 x i8> %y to <1024 x i32>
  %m = mul nuw nsw <1024 x i32> %x.ext, %y.ext
  %partial.reduce = tail call <256 x i32> @llvm.vector.partial.reduce.add.v256i32.v1024i32(<256 x i32> %acc, <1024 x i32> %m)
  ret <256 x i32> %partial.reduce
}
|
||||
|
||||
;; Check vector sizes that do not match an available vrmpy (2x reduction).
|
||||
;; Partial reduction of <128 x i32> products into a <64 x i32> accumulator
;; (a 2x reduction). With HVX the expected code contains vmpy and vadd;
;; nothing beyond the label is matched without HVX.
define <64 x i32> @partial_reduce_unsupported(<64 x i32> %acc, <128 x i8> %x, <128 x i8> %y) #1 {
; CHECK-LABEL: partial_reduce_unsupported:
; CHECK-HVX: vmpy
; CHECK-HVX: vadd
  %lhs = zext <128 x i8> %x to <128 x i32>
  %rhs = zext <128 x i8> %y to <128 x i32>
  %prod = mul nuw nsw <128 x i32> %lhs, %rhs
  %acc.next = tail call <64 x i32> @llvm.vector.partial.reduce.add.v64i32.v128i32(<64 x i32> %acc, <128 x i32> %prod)
  ret <64 x i32> %acc.next
}
|
||||
|
||||
;; Attribute group #0: target CPU hexagonv73 with HVX v73 and a 64-byte HVX
;; vector length.
attributes #0 = { nounwind "target-cpu"="hexagonv73" "target-features"="+hvxv73,+hvx-length64b" }
|
||||
;; Attribute group #1: target CPU hexagonv73 with HVX v73 and a 128-byte HVX
;; vector length.
attributes #1 = { nounwind "target-cpu"="hexagonv73" "target-features"="+hvxv73,+hvx-length128b" }
|
||||
Loading…
x
Reference in New Issue
Block a user