[PowerPC] vector shift word/double by element size - 1 use all ones (#139794)
A vector shift of word or doubleword elements by element size - 1 requires a shift-amount vector of 31 or 63, which is too big for a splat immediate and therefore requires a multi-instruction sequence. However, the PPC instructions only use the low 5 or 6 bits of each shift-amount vector element, so an all-ones mask, which we can generate efficiently, works.
This commit is contained in:
parent
3c9812eeea
commit
bbca78fbcb
@ -18456,36 +18456,80 @@ static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N,
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
SDValue PPCTargetLowering::combineVectorSHL(SDNode *N,
|
||||
DAGCombinerInfo &DCI) const {
|
||||
SDValue PPCTargetLowering::combineVectorShift(SDNode *N,
|
||||
DAGCombinerInfo &DCI) const {
|
||||
EVT VT = N->getValueType(0);
|
||||
assert(VT.isVector() && "Vector type expected.");
|
||||
|
||||
unsigned Opc = N->getOpcode();
|
||||
assert((Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA) &&
|
||||
"Unexpected opcode.");
|
||||
|
||||
if (!isOperationLegal(Opc, VT))
|
||||
return SDValue();
|
||||
|
||||
EVT EltTy = VT.getScalarType();
|
||||
unsigned EltBits = EltTy.getSizeInBits();
|
||||
if (EltTy != MVT::i64 && EltTy != MVT::i32)
|
||||
return SDValue();
|
||||
|
||||
SDValue N1 = N->getOperand(1);
|
||||
if (!Subtarget.hasP8Altivec() || N1.getOpcode() != ISD::BUILD_VECTOR ||
|
||||
!isOperationLegal(ISD::ADD, VT))
|
||||
uint64_t SplatBits = 0;
|
||||
bool AddSplatCase = false;
|
||||
unsigned OpcN1 = N1.getOpcode();
|
||||
if (OpcN1 == PPCISD::VADD_SPLAT &&
|
||||
N1.getConstantOperandVal(1) == VT.getVectorNumElements()) {
|
||||
AddSplatCase = true;
|
||||
SplatBits = N1.getConstantOperandVal(0);
|
||||
}
|
||||
|
||||
if (!AddSplatCase) {
|
||||
if (OpcN1 != ISD::BUILD_VECTOR)
|
||||
return SDValue();
|
||||
|
||||
unsigned SplatBitSize;
|
||||
bool HasAnyUndefs;
|
||||
APInt APSplatBits, APSplatUndef;
|
||||
BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(N1);
|
||||
bool BVNIsConstantSplat =
|
||||
BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
|
||||
HasAnyUndefs, 0, !Subtarget.isLittleEndian());
|
||||
if (!BVNIsConstantSplat || SplatBitSize != EltBits)
|
||||
return SDValue();
|
||||
SplatBits = APSplatBits.getZExtValue();
|
||||
}
|
||||
|
||||
SDLoc DL(N);
|
||||
SDValue N0 = N->getOperand(0);
|
||||
// PPC vector shifts by word/double look at only the low 5/6 bits of the
|
||||
// shift vector, which means the max value is 31/63. A shift vector of all
|
||||
// 1s will be truncated to 31/63, which is useful as vspltisw is limited to
|
||||
// -16 to 15 range.
|
||||
if (SplatBits == (EltBits - 1)) {
|
||||
unsigned NewOpc;
|
||||
switch (Opc) {
|
||||
case ISD::SHL:
|
||||
NewOpc = PPCISD::SHL;
|
||||
break;
|
||||
case ISD::SRL:
|
||||
NewOpc = PPCISD::SRL;
|
||||
break;
|
||||
case ISD::SRA:
|
||||
NewOpc = PPCISD::SRA;
|
||||
break;
|
||||
}
|
||||
SDValue SplatOnes = getCanonicalConstSplat(255, 1, VT, DCI.DAG, DL);
|
||||
return DCI.DAG.getNode(NewOpc, DL, VT, N0, SplatOnes);
|
||||
}
|
||||
|
||||
if (Opc != ISD::SHL || !isOperationLegal(ISD::ADD, VT))
|
||||
return SDValue();
|
||||
|
||||
// For 64-bit there is no splat immediate so we want to catch shift by 1 here
|
||||
// before the BUILD_VECTOR is replaced by a load.
|
||||
EVT EltTy = VT.getScalarType();
|
||||
if (EltTy != MVT::i64)
|
||||
if (EltTy != MVT::i64 || SplatBits != 1)
|
||||
return SDValue();
|
||||
|
||||
BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(N1);
|
||||
APInt APSplatBits, APSplatUndef;
|
||||
unsigned SplatBitSize;
|
||||
bool HasAnyUndefs;
|
||||
bool BVNIsConstantSplat =
|
||||
BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
|
||||
HasAnyUndefs, 0, !Subtarget.isLittleEndian());
|
||||
if (!BVNIsConstantSplat || SplatBitSize != EltTy.getSizeInBits())
|
||||
return SDValue();
|
||||
uint64_t SplatBits = APSplatBits.getZExtValue();
|
||||
if (SplatBits != 1)
|
||||
return SDValue();
|
||||
|
||||
SDValue N0 = N->getOperand(0);
|
||||
return DCI.DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
|
||||
}
|
||||
|
||||
@ -18494,7 +18538,7 @@ SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const {
|
||||
return Value;
|
||||
|
||||
if (N->getValueType(0).isVector())
|
||||
return combineVectorSHL(N, DCI);
|
||||
return combineVectorShift(N, DCI);
|
||||
|
||||
SDValue N0 = N->getOperand(0);
|
||||
ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
|
||||
@ -18526,6 +18570,9 @@ SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const {
|
||||
if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
|
||||
return Value;
|
||||
|
||||
if (N->getValueType(0).isVector())
|
||||
return combineVectorShift(N, DCI);
|
||||
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
@ -18533,6 +18580,9 @@ SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const {
|
||||
if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
|
||||
return Value;
|
||||
|
||||
if (N->getValueType(0).isVector())
|
||||
return combineVectorShift(N, DCI);
|
||||
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
|
||||
@ -1441,7 +1441,7 @@ namespace llvm {
|
||||
SDValue combineStoreFPToInt(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
SDValue combineSHL(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
SDValue combineVectorSHL(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
SDValue combineVectorShift(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
SDValue combineSRA(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
SDValue combineMUL(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
|
||||
@ -252,23 +252,19 @@ define <4 x i32> @test7_v4i32(<4 x i32> %a) {
|
||||
ret <4 x i32> %tmp.1
|
||||
}
|
||||
; CHECK-LABEL: test7_v4i32:
|
||||
; CHECK-DAG: vspltisw v[[REG2:[0-9]+]], -16
|
||||
; CHECK-DAG: vspltisw v[[REG3:[0-9]+]], 15
|
||||
; CHECK-NEXT: vsubuwm v[[REG4:[0-9]+]], v[[REG3]], v[[REG2]]
|
||||
; CHECK: xxleqv v[[REG1:[0-9]+]], v[[REG2:[0-9]+]], v[[REG2]]
|
||||
; CHECK-NOT: vmul
|
||||
; CHECK-NEXT: vslw v[[REG5:[0-9]+]], v2, v[[REG4]]
|
||||
; CHECK-NEXT: vslw v[[REG3:[0-9]+]], v2, v[[REG1]]
|
||||
|
||||
define <4 x i32> @test8_v4i32(<4 x i32> %a) {
|
||||
%tmp.1 = mul nsw <4 x i32> %a, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647> ; <<4 x i32>> [#uses=1]
|
||||
ret <4 x i32> %tmp.1
|
||||
}
|
||||
; CHECK-LABEL: test8_v4i32:
|
||||
; CHECK-DAG: vspltisw v[[REG2:[0-9]+]], -16
|
||||
; CHECK-DAG: vspltisw v[[REG3:[0-9]+]], 15
|
||||
; CHECK-NEXT: vsubuwm v[[REG4:[0-9]+]], v[[REG3]], v[[REG2]]
|
||||
; CHECK: xxleqv v[[REG1:[0-9]+]], v[[REG2:[0-9]+]], v[[REG2]]
|
||||
; CHECK-NOT: vmul
|
||||
; CHECK-NEXT: vslw v[[REG5:[0-9]+]], v2, v[[REG4]]
|
||||
; CHECK-NEXT: vsubuwm v[[REG6:[0-9]+]], v[[REG5]], v2
|
||||
; CHECK-NEXT: vslw v[[REG3:[0-9]+]], v2, v[[REG1]]
|
||||
; CHECK-NEXT: vsubuwm v[[REG4:[0-9]+]], v[[REG3]], v2
|
||||
|
||||
define <2 x i64> @test1_v2i64(<2 x i64> %a) {
|
||||
%tmp.1 = mul nsw <2 x i64> %a, <i64 16, i64 16> ; <<2 x i64>> [#uses=1]
|
||||
@ -356,8 +352,7 @@ define <2 x i64> @test7_v2i64(<2 x i64> %a) {
|
||||
}
|
||||
|
||||
; CHECK-LABEL: test7_v2i64:
|
||||
; CHECK-P8: lxvd2x v[[REG1:[0-9]+]], 0, r{{[0-9]+}}
|
||||
; CHECK-P9: lxv v[[REG2:[0-9]+]], 0(r{{[0-9]+}})
|
||||
; CHECK: xxleqv v[[REG2:[0-9]+]], v[[REG1:[0-9]+]], v[[REG1]]
|
||||
; CHECK-NOT: vmul
|
||||
; CHECK-NEXT: vsld v[[REG4:[0-9]+]], v2, v[[REG2]]
|
||||
|
||||
@ -367,8 +362,7 @@ define <2 x i64> @test8_v2i64(<2 x i64> %a) {
|
||||
}
|
||||
|
||||
; CHECK-LABEL: test8_v2i64:
|
||||
; CHECK-P8: lxvd2x v[[REG1:[0-9]+]], 0, r{{[0-9]+}}
|
||||
; CHECK-P9: lxv v[[REG2:[0-9]+]], 0(r{{[0-9]+}})
|
||||
; CHECK: xxleqv v[[REG2:[0-9]+]], v[[REG1:[0-9]+]], v[[REG1]]
|
||||
; CHECK-NOT: vmul
|
||||
; CHECK-NEXT: vsld v[[REG3:[0-9]+]], v2, v[[REG2]]
|
||||
; CHECK-NEXT: vsubudm v{{[0-9]+}}, v[[REG3]], v2
|
||||
|
||||
@ -7,13 +7,11 @@
|
||||
define dso_local void @poly2_lshift1(ptr nocapture %p) local_unnamed_addr #0 {
|
||||
; CHECK-LABEL: poly2_lshift1:
|
||||
; CHECK: # %bb.0: # %entry
|
||||
; CHECK-NEXT: addis r6, r2, .LCPI0_0@toc@ha
|
||||
; CHECK-NEXT: ld r6, 0(r3)
|
||||
; CHECK-NEXT: li r4, 72
|
||||
; CHECK-NEXT: ld r5, 64(r3)
|
||||
; CHECK-NEXT: addi r6, r6, .LCPI0_0@toc@l
|
||||
; CHECK-NEXT: xxleqv v4, v4, v4
|
||||
; CHECK-NEXT: lxvd2x vs0, r3, r4
|
||||
; CHECK-NEXT: lxvd2x v4, 0, r6
|
||||
; CHECK-NEXT: ld r6, 0(r3)
|
||||
; CHECK-NEXT: sldi r7, r6, 1
|
||||
; CHECK-NEXT: rotldi r6, r6, 1
|
||||
; CHECK-NEXT: std r7, 0(r3)
|
||||
@ -35,11 +33,11 @@ define dso_local void @poly2_lshift1(ptr nocapture %p) local_unnamed_addr #0 {
|
||||
; CHECK-NEXT: std r7, 32(r3)
|
||||
; CHECK-NEXT: ld r7, 40(r3)
|
||||
; CHECK-NEXT: rldimi r6, r7, 1, 0
|
||||
; CHECK-NEXT: xxswapd v2, vs0
|
||||
; CHECK-NEXT: mtfprd f0, r5
|
||||
; CHECK-NEXT: rotldi r7, r7, 1
|
||||
; CHECK-NEXT: std r6, 40(r3)
|
||||
; CHECK-NEXT: ld r6, 48(r3)
|
||||
; CHECK-NEXT: xxswapd v2, vs0
|
||||
; CHECK-NEXT: mtfprd f0, r5
|
||||
; CHECK-NEXT: rldimi r7, r6, 1, 0
|
||||
; CHECK-NEXT: rotldi r6, r6, 1
|
||||
; CHECK-NEXT: std r7, 48(r3)
|
||||
|
||||
@ -188,12 +188,10 @@ define i32 @add_lshr_not(i32 %x) {
|
||||
define <4 x i32> @add_lshr_not_vec_splat(<4 x i32> %x) {
|
||||
; CHECK-LABEL: add_lshr_not_vec_splat:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vspltisw 3, -16
|
||||
; CHECK-NEXT: vspltisw 4, 15
|
||||
; CHECK-NEXT: addis 3, 2, .LCPI15_0@toc@ha
|
||||
; CHECK-NEXT: vsubuwm 3, 4, 3
|
||||
; CHECK-NEXT: addi 3, 3, .LCPI15_0@toc@l
|
||||
; CHECK-NEXT: xxleqv 35, 35, 35
|
||||
; CHECK-NEXT: vsraw 2, 2, 3
|
||||
; CHECK-NEXT: addi 3, 3, .LCPI15_0@toc@l
|
||||
; CHECK-NEXT: lxvd2x 35, 0, 3
|
||||
; CHECK-NEXT: vadduwm 2, 2, 3
|
||||
; CHECK-NEXT: blr
|
||||
@ -218,12 +216,10 @@ define i32 @sub_lshr_not(i32 %x) {
|
||||
define <4 x i32> @sub_lshr_not_vec_splat(<4 x i32> %x) {
|
||||
; CHECK-LABEL: sub_lshr_not_vec_splat:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vspltisw 3, -16
|
||||
; CHECK-NEXT: vspltisw 4, 15
|
||||
; CHECK-NEXT: addis 3, 2, .LCPI17_0@toc@ha
|
||||
; CHECK-NEXT: vsubuwm 3, 4, 3
|
||||
; CHECK-NEXT: addi 3, 3, .LCPI17_0@toc@l
|
||||
; CHECK-NEXT: xxleqv 35, 35, 35
|
||||
; CHECK-NEXT: vsrw 2, 2, 3
|
||||
; CHECK-NEXT: addi 3, 3, .LCPI17_0@toc@l
|
||||
; CHECK-NEXT: lxvd2x 35, 0, 3
|
||||
; CHECK-NEXT: vadduwm 2, 2, 3
|
||||
; CHECK-NEXT: blr
|
||||
@ -247,9 +243,7 @@ define i32 @sub_lshr(i32 %x, i32 %y) {
|
||||
define <4 x i32> @sub_lshr_vec(<4 x i32> %x, <4 x i32> %y) {
|
||||
; CHECK-LABEL: sub_lshr_vec:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vspltisw 4, -16
|
||||
; CHECK-NEXT: vspltisw 5, 15
|
||||
; CHECK-NEXT: vsubuwm 4, 5, 4
|
||||
; CHECK-NEXT: xxleqv 36, 36, 36
|
||||
; CHECK-NEXT: vsraw 2, 2, 4
|
||||
; CHECK-NEXT: vadduwm 2, 3, 2
|
||||
; CHECK-NEXT: blr
|
||||
@ -272,12 +266,10 @@ define i32 @sub_const_op_lshr(i32 %x) {
|
||||
define <4 x i32> @sub_const_op_lshr_vec(<4 x i32> %x) {
|
||||
; CHECK-LABEL: sub_const_op_lshr_vec:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vspltisw 3, -16
|
||||
; CHECK-NEXT: vspltisw 4, 15
|
||||
; CHECK-NEXT: addis 3, 2, .LCPI21_0@toc@ha
|
||||
; CHECK-NEXT: vsubuwm 3, 4, 3
|
||||
; CHECK-NEXT: addi 3, 3, .LCPI21_0@toc@l
|
||||
; CHECK-NEXT: xxleqv 35, 35, 35
|
||||
; CHECK-NEXT: vsraw 2, 2, 3
|
||||
; CHECK-NEXT: addi 3, 3, .LCPI21_0@toc@l
|
||||
; CHECK-NEXT: lxvd2x 35, 0, 3
|
||||
; CHECK-NEXT: vadduwm 2, 2, 3
|
||||
; CHECK-NEXT: blr
|
||||
|
||||
@ -11,19 +11,17 @@ define <4 x i32> @sel_C1_or_C2_vec(<4 x i1> %cond) {
|
||||
; CHECK-LABEL: sel_C1_or_C2_vec:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha
|
||||
; CHECK-NEXT: vspltisw 3, -16
|
||||
; CHECK-NEXT: vspltisw 4, 15
|
||||
; CHECK-NEXT: xxleqv 37, 37, 37
|
||||
; CHECK-NEXT: vslw 2, 2, 5
|
||||
; CHECK-NEXT: addi 3, 3, .LCPI0_0@toc@l
|
||||
; CHECK-NEXT: vsubuwm 3, 4, 3
|
||||
; CHECK-NEXT: vsraw 2, 2, 5
|
||||
; CHECK-NEXT: lxvd2x 0, 0, 3
|
||||
; CHECK-NEXT: addis 3, 2, .LCPI0_1@toc@ha
|
||||
; CHECK-NEXT: vslw 2, 2, 3
|
||||
; CHECK-NEXT: addi 3, 3, .LCPI0_1@toc@l
|
||||
; CHECK-NEXT: vsraw 2, 2, 3
|
||||
; CHECK-NEXT: xxswapd 37, 0
|
||||
; CHECK-NEXT: xxswapd 35, 0
|
||||
; CHECK-NEXT: lxvd2x 0, 0, 3
|
||||
; CHECK-NEXT: xxswapd 32, 0
|
||||
; CHECK-NEXT: xxsel 34, 32, 37, 34
|
||||
; CHECK-NEXT: xxswapd 36, 0
|
||||
; CHECK-NEXT: xxsel 34, 36, 35, 34
|
||||
; CHECK-NEXT: blr
|
||||
%add = select <4 x i1> %cond, <4 x i32> <i32 3000, i32 1, i32 -1, i32 0>, <4 x i32> <i32 42, i32 0, i32 -2, i32 -1>
|
||||
ret <4 x i32> %add
|
||||
@ -82,15 +80,13 @@ define <4 x i32> @sel_Cminus1_or_C_vec(<4 x i1> %cond) {
|
||||
; CHECK-LABEL: sel_Cminus1_or_C_vec:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: addis 3, 2, .LCPI4_0@toc@ha
|
||||
; CHECK-NEXT: vspltisw 3, -16
|
||||
; CHECK-NEXT: vspltisw 4, 15
|
||||
; CHECK-NEXT: xxleqv 36, 36, 36
|
||||
; CHECK-NEXT: vslw 2, 2, 4
|
||||
; CHECK-NEXT: addi 3, 3, .LCPI4_0@toc@l
|
||||
; CHECK-NEXT: vsubuwm 3, 4, 3
|
||||
; CHECK-NEXT: vsraw 2, 2, 4
|
||||
; CHECK-NEXT: lxvd2x 0, 0, 3
|
||||
; CHECK-NEXT: vslw 2, 2, 3
|
||||
; CHECK-NEXT: vsraw 2, 2, 3
|
||||
; CHECK-NEXT: xxswapd 37, 0
|
||||
; CHECK-NEXT: vadduwm 2, 2, 5
|
||||
; CHECK-NEXT: xxswapd 35, 0
|
||||
; CHECK-NEXT: vadduwm 2, 2, 3
|
||||
; CHECK-NEXT: blr
|
||||
%add = select <4 x i1> %cond, <4 x i32> <i32 43, i32 1, i32 -1, i32 0>, <4 x i32> <i32 44, i32 2, i32 0, i32 1>
|
||||
ret <4 x i32> %add
|
||||
@ -114,9 +110,7 @@ define <4 x i32> @cmp_sel_Cminus1_or_C_vec(<4 x i32> %x, <4 x i32> %y) {
|
||||
define <4 x i32> @sel_minus1_or_0_vec(<4 x i1> %cond) {
|
||||
; CHECK-LABEL: sel_minus1_or_0_vec:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vspltisw 3, -16
|
||||
; CHECK-NEXT: vspltisw 4, 15
|
||||
; CHECK-NEXT: vsubuwm 3, 4, 3
|
||||
; CHECK-NEXT: xxleqv 35, 35, 35
|
||||
; CHECK-NEXT: vslw 2, 2, 3
|
||||
; CHECK-NEXT: vsraw 2, 2, 3
|
||||
; CHECK-NEXT: blr
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user