[PowerPC] vector shift word/double by element size - 1 use all ones (#139794)

A vector shift of word or doubleword elements by (element size - 1) needs a
shift-amount vector whose elements are 31 or 63. That value is too big for a
splat immediate, so it requires a multi-instruction sequence. However, the PPC
shift instructions use only the low 5 or 6 bits of each shift-amount element,
so an all-ones vector, which we can generate efficiently, works just as well.
This commit is contained in:
RolandF77 2025-05-23 10:49:37 -04:00 committed by GitHub
parent 3c9812eeea
commit bbca78fbcb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 102 additions and 74 deletions

View File

@ -18456,36 +18456,80 @@ static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N,
return SDValue();
}
// NOTE(review): this listing is a rendered diff whose +/- markers were
// stripped, so pre-change and post-change lines are interleaved. The two
// lines below carry the name combineVectorSHL; they appear to be the
// pre-change signature -- confirm against the repository.
SDValue PPCTargetLowering::combineVectorSHL(SDNode *N,
DAGCombinerInfo &DCI) const {
// Target DAG combine for vector shifts (ISD::SHL, ISD::SRL, ISD::SRA) whose
// shift-amount operand is a constant splat. Returns the replacement node, or
// an empty SDValue when no rewrite applies.
SDValue PPCTargetLowering::combineVectorShift(SDNode *N,
DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
assert(VT.isVector() && "Vector type expected.");
unsigned Opc = N->getOpcode();
assert((Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA) &&
"Unexpected opcode.");
// Nothing to do unless the shift itself is legal for this vector type.
if (!isOperationLegal(Opc, VT))
return SDValue();
// Only 32-bit and 64-bit element types are handled.
EVT EltTy = VT.getScalarType();
unsigned EltBits = EltTy.getSizeInBits();
if (EltTy != MVT::i64 && EltTy != MVT::i32)
return SDValue();
SDValue N1 = N->getOperand(1);
// NOTE(review): the two-line condition below has no body of its own in this
// listing; it looks like the pre-change guard that this commit removed --
// confirm against the repository.
if (!Subtarget.hasP8Altivec() || N1.getOpcode() != ISD::BUILD_VECTOR ||
!isOperationLegal(ISD::ADD, VT))
// Splat value of the shift-amount operand; set by one of the two cases below.
uint64_t SplatBits = 0;
bool AddSplatCase = false;
unsigned OpcN1 = N1.getOpcode();
// Case 1: the shift amount is a PPCISD::VADD_SPLAT node whose operand 1
// equals the number of elements in VT; its operand 0 is used as the splat
// value.
if (OpcN1 == PPCISD::VADD_SPLAT &&
N1.getConstantOperandVal(1) == VT.getVectorNumElements()) {
AddSplatCase = true;
SplatBits = N1.getConstantOperandVal(0);
}
// Case 2: otherwise the shift amount must be a BUILD_VECTOR that is a
// constant splat whose splat width equals the element width.
if (!AddSplatCase) {
if (OpcN1 != ISD::BUILD_VECTOR)
return SDValue();
unsigned SplatBitSize;
bool HasAnyUndefs;
APInt APSplatBits, APSplatUndef;
BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(N1);
bool BVNIsConstantSplat =
BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
HasAnyUndefs, 0, !Subtarget.isLittleEndian());
if (!BVNIsConstantSplat || SplatBitSize != EltBits)
return SDValue();
SplatBits = APSplatBits.getZExtValue();
}
SDLoc DL(N);
SDValue N0 = N->getOperand(0);
// PPC vector shifts by word/double look at only the low 5/6 bits of the
// shift vector, which means the max value is 31/63. A shift vector of all
// 1s will be truncated to 31/63, which is useful as vspltiw is limited to
// -16 to 15 range.
if (SplatBits == (EltBits - 1)) {
// Map the generic shift opcode to the matching PPCISD shift opcode.
// NOTE(review): NewOpc has no initializer and the switch has no default
// case. The assert above restricts Opc to these three values, but compilers
// may still warn that NewOpc can be used uninitialized.
unsigned NewOpc;
switch (Opc) {
case ISD::SHL:
NewOpc = PPCISD::SHL;
break;
case ISD::SRL:
NewOpc = PPCISD::SRL;
break;
case ISD::SRA:
NewOpc = PPCISD::SRA;
break;
}
// Replacement shift amount: presumably a one-byte splat of 0xFF, i.e. the
// all-ones vector described above -- confirm getCanonicalConstSplat's
// parameter meaning.
SDValue SplatOnes = getCanonicalConstSplat(255, 1, VT, DCI.DAG, DL);
return DCI.DAG.getNode(NewOpc, DL, VT, N0, SplatOnes);
}
// The remaining rewrite applies only to SHL, and only when vector ADD is
// legal for this type.
if (Opc != ISD::SHL || !isOperationLegal(ISD::ADD, VT))
return SDValue();
// For 64-bit there is no splat immediate so we want to catch shift by 1 here
// before the BUILD_VECTOR is replaced by a load.
// NOTE(review): EltTy is declared a second time on the next line, and the
// bare `if` after it is immediately followed by another `if`; both look like
// pre-change lines that this commit removed.
EVT EltTy = VT.getScalarType();
if (EltTy != MVT::i64)
if (EltTy != MVT::i64 || SplatBits != 1)
return SDValue();
// NOTE(review): from here through the second N0 declaration, the lines
// redeclare names already defined above (BVN, APSplatBits, SplatBitSize,
// SplatBits, N0); they look like the pre-change splat detection that this
// commit removed.
BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(N1);
APInt APSplatBits, APSplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
bool BVNIsConstantSplat =
BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
HasAnyUndefs, 0, !Subtarget.isLittleEndian());
if (!BVNIsConstantSplat || SplatBitSize != EltTy.getSizeInBits())
return SDValue();
uint64_t SplatBits = APSplatBits.getZExtValue();
if (SplatBits != 1)
return SDValue();
SDValue N0 = N->getOperand(0);
// A 64-bit element shift-left by 1 is emitted as N0 + N0.
return DCI.DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
}
@ -18494,7 +18538,7 @@ SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const {
return Value;
if (N->getValueType(0).isVector())
return combineVectorSHL(N, DCI);
return combineVectorShift(N, DCI);
SDValue N0 = N->getOperand(0);
ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
@ -18526,6 +18570,9 @@ SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const {
if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
return Value;
if (N->getValueType(0).isVector())
return combineVectorShift(N, DCI);
return SDValue();
}
@ -18533,6 +18580,9 @@ SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const {
if (auto Value = stripModuloOnShift(*this, N, DCI.DAG))
return Value;
if (N->getValueType(0).isVector())
return combineVectorShift(N, DCI);
return SDValue();
}

View File

@ -1441,7 +1441,7 @@ namespace llvm {
SDValue combineStoreFPToInt(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSHL(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineVectorSHL(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineVectorShift(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSRA(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineMUL(SDNode *N, DAGCombinerInfo &DCI) const;

View File

@ -252,23 +252,19 @@ define <4 x i32> @test7_v4i32(<4 x i32> %a) {
ret <4 x i32> %tmp.1
}
; CHECK-LABEL: test7_v4i32:
; CHECK-DAG: vspltisw v[[REG2:[0-9]+]], -16
; CHECK-DAG: vspltisw v[[REG3:[0-9]+]], 15
; CHECK-NEXT: vsubuwm v[[REG4:[0-9]+]], v[[REG3]], v[[REG2]]
; CHECK: xxleqv v[[REG1:[0-9]+]], v[[REG2:[0-9]+]], v[[REG2]]
; CHECK-NOT: vmul
; CHECK-NEXT: vslw v[[REG5:[0-9]+]], v2, v[[REG4]]
; CHECK-NEXT: vslw v[[REG3:[0-9]+]], v2, v[[REG1]]
define <4 x i32> @test8_v4i32(<4 x i32> %a) {
%tmp.1 = mul nsw <4 x i32> %a, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647> ; <<4 x i32>> [#uses=1]
ret <4 x i32> %tmp.1
}
; CHECK-LABEL: test8_v4i32:
; CHECK-DAG: vspltisw v[[REG2:[0-9]+]], -16
; CHECK-DAG: vspltisw v[[REG3:[0-9]+]], 15
; CHECK-NEXT: vsubuwm v[[REG4:[0-9]+]], v[[REG3]], v[[REG2]]
; CHECK: xxleqv v[[REG1:[0-9]+]], v[[REG2:[0-9]+]], v[[REG2]]
; CHECK-NOT: vmul
; CHECK-NEXT: vslw v[[REG5:[0-9]+]], v2, v[[REG4]]
; CHECK-NEXT: vsubuwm v[[REG6:[0-9]+]], v[[REG5]], v2
; CHECK-NEXT: vslw v[[REG3:[0-9]+]], v2, v[[REG1]]
; CHECK-NEXT: vsubuwm v[[REG4:[0-9]+]], v[[REG3]], v2
define <2 x i64> @test1_v2i64(<2 x i64> %a) {
%tmp.1 = mul nsw <2 x i64> %a, <i64 16, i64 16> ; <<2 x i64>> [#uses=1]
@ -356,8 +352,7 @@ define <2 x i64> @test7_v2i64(<2 x i64> %a) {
}
; CHECK-LABEL: test7_v2i64:
; CHECK-P8: lxvd2x v[[REG1:[0-9]+]], 0, r{{[0-9]+}}
; CHECK-P9: lxv v[[REG2:[0-9]+]], 0(r{{[0-9]+}})
; CHECK: xxleqv v[[REG2:[0-9]+]], v[[REG1:[0-9]+]], v[[REG1]]
; CHECK-NOT: vmul
; CHECK-NEXT: vsld v[[REG4:[0-9]+]], v2, v[[REG2]]
@ -367,8 +362,7 @@ define <2 x i64> @test8_v2i64(<2 x i64> %a) {
}
; CHECK-LABEL: test8_v2i64:
; CHECK-P8: lxvd2x v[[REG1:[0-9]+]], 0, r{{[0-9]+}}
; CHECK-P9: lxv v[[REG2:[0-9]+]], 0(r{{[0-9]+}})
; CHECK: xxleqv v[[REG2:[0-9]+]], v[[REG1:[0-9]+]], v[[REG1]]
; CHECK-NOT: vmul
; CHECK-NEXT: vsld v[[REG3:[0-9]+]], v2, v[[REG2]]
; CHECK-NEXT: vsubudm v{{[0-9]+}}, v[[REG3]], v2

View File

@ -7,13 +7,11 @@
define dso_local void @poly2_lshift1(ptr nocapture %p) local_unnamed_addr #0 {
; CHECK-LABEL: poly2_lshift1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addis r6, r2, .LCPI0_0@toc@ha
; CHECK-NEXT: ld r6, 0(r3)
; CHECK-NEXT: li r4, 72
; CHECK-NEXT: ld r5, 64(r3)
; CHECK-NEXT: addi r6, r6, .LCPI0_0@toc@l
; CHECK-NEXT: xxleqv v4, v4, v4
; CHECK-NEXT: lxvd2x vs0, r3, r4
; CHECK-NEXT: lxvd2x v4, 0, r6
; CHECK-NEXT: ld r6, 0(r3)
; CHECK-NEXT: sldi r7, r6, 1
; CHECK-NEXT: rotldi r6, r6, 1
; CHECK-NEXT: std r7, 0(r3)
@ -35,11 +33,11 @@ define dso_local void @poly2_lshift1(ptr nocapture %p) local_unnamed_addr #0 {
; CHECK-NEXT: std r7, 32(r3)
; CHECK-NEXT: ld r7, 40(r3)
; CHECK-NEXT: rldimi r6, r7, 1, 0
; CHECK-NEXT: xxswapd v2, vs0
; CHECK-NEXT: mtfprd f0, r5
; CHECK-NEXT: rotldi r7, r7, 1
; CHECK-NEXT: std r6, 40(r3)
; CHECK-NEXT: ld r6, 48(r3)
; CHECK-NEXT: xxswapd v2, vs0
; CHECK-NEXT: mtfprd f0, r5
; CHECK-NEXT: rldimi r7, r6, 1, 0
; CHECK-NEXT: rotldi r6, r6, 1
; CHECK-NEXT: std r7, 48(r3)

View File

@ -188,12 +188,10 @@ define i32 @add_lshr_not(i32 %x) {
define <4 x i32> @add_lshr_not_vec_splat(<4 x i32> %x) {
; CHECK-LABEL: add_lshr_not_vec_splat:
; CHECK: # %bb.0:
; CHECK-NEXT: vspltisw 3, -16
; CHECK-NEXT: vspltisw 4, 15
; CHECK-NEXT: addis 3, 2, .LCPI15_0@toc@ha
; CHECK-NEXT: vsubuwm 3, 4, 3
; CHECK-NEXT: addi 3, 3, .LCPI15_0@toc@l
; CHECK-NEXT: xxleqv 35, 35, 35
; CHECK-NEXT: vsraw 2, 2, 3
; CHECK-NEXT: addi 3, 3, .LCPI15_0@toc@l
; CHECK-NEXT: lxvd2x 35, 0, 3
; CHECK-NEXT: vadduwm 2, 2, 3
; CHECK-NEXT: blr
@ -218,12 +216,10 @@ define i32 @sub_lshr_not(i32 %x) {
define <4 x i32> @sub_lshr_not_vec_splat(<4 x i32> %x) {
; CHECK-LABEL: sub_lshr_not_vec_splat:
; CHECK: # %bb.0:
; CHECK-NEXT: vspltisw 3, -16
; CHECK-NEXT: vspltisw 4, 15
; CHECK-NEXT: addis 3, 2, .LCPI17_0@toc@ha
; CHECK-NEXT: vsubuwm 3, 4, 3
; CHECK-NEXT: addi 3, 3, .LCPI17_0@toc@l
; CHECK-NEXT: xxleqv 35, 35, 35
; CHECK-NEXT: vsrw 2, 2, 3
; CHECK-NEXT: addi 3, 3, .LCPI17_0@toc@l
; CHECK-NEXT: lxvd2x 35, 0, 3
; CHECK-NEXT: vadduwm 2, 2, 3
; CHECK-NEXT: blr
@ -247,9 +243,7 @@ define i32 @sub_lshr(i32 %x, i32 %y) {
define <4 x i32> @sub_lshr_vec(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: sub_lshr_vec:
; CHECK: # %bb.0:
; CHECK-NEXT: vspltisw 4, -16
; CHECK-NEXT: vspltisw 5, 15
; CHECK-NEXT: vsubuwm 4, 5, 4
; CHECK-NEXT: xxleqv 36, 36, 36
; CHECK-NEXT: vsraw 2, 2, 4
; CHECK-NEXT: vadduwm 2, 3, 2
; CHECK-NEXT: blr
@ -272,12 +266,10 @@ define i32 @sub_const_op_lshr(i32 %x) {
define <4 x i32> @sub_const_op_lshr_vec(<4 x i32> %x) {
; CHECK-LABEL: sub_const_op_lshr_vec:
; CHECK: # %bb.0:
; CHECK-NEXT: vspltisw 3, -16
; CHECK-NEXT: vspltisw 4, 15
; CHECK-NEXT: addis 3, 2, .LCPI21_0@toc@ha
; CHECK-NEXT: vsubuwm 3, 4, 3
; CHECK-NEXT: addi 3, 3, .LCPI21_0@toc@l
; CHECK-NEXT: xxleqv 35, 35, 35
; CHECK-NEXT: vsraw 2, 2, 3
; CHECK-NEXT: addi 3, 3, .LCPI21_0@toc@l
; CHECK-NEXT: lxvd2x 35, 0, 3
; CHECK-NEXT: vadduwm 2, 2, 3
; CHECK-NEXT: blr

View File

@ -11,19 +11,17 @@ define <4 x i32> @sel_C1_or_C2_vec(<4 x i1> %cond) {
; CHECK-LABEL: sel_C1_or_C2_vec:
; CHECK: # %bb.0:
; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha
; CHECK-NEXT: vspltisw 3, -16
; CHECK-NEXT: vspltisw 4, 15
; CHECK-NEXT: xxleqv 37, 37, 37
; CHECK-NEXT: vslw 2, 2, 5
; CHECK-NEXT: addi 3, 3, .LCPI0_0@toc@l
; CHECK-NEXT: vsubuwm 3, 4, 3
; CHECK-NEXT: vsraw 2, 2, 5
; CHECK-NEXT: lxvd2x 0, 0, 3
; CHECK-NEXT: addis 3, 2, .LCPI0_1@toc@ha
; CHECK-NEXT: vslw 2, 2, 3
; CHECK-NEXT: addi 3, 3, .LCPI0_1@toc@l
; CHECK-NEXT: vsraw 2, 2, 3
; CHECK-NEXT: xxswapd 37, 0
; CHECK-NEXT: xxswapd 35, 0
; CHECK-NEXT: lxvd2x 0, 0, 3
; CHECK-NEXT: xxswapd 32, 0
; CHECK-NEXT: xxsel 34, 32, 37, 34
; CHECK-NEXT: xxswapd 36, 0
; CHECK-NEXT: xxsel 34, 36, 35, 34
; CHECK-NEXT: blr
%add = select <4 x i1> %cond, <4 x i32> <i32 3000, i32 1, i32 -1, i32 0>, <4 x i32> <i32 42, i32 0, i32 -2, i32 -1>
ret <4 x i32> %add
@ -82,15 +80,13 @@ define <4 x i32> @sel_Cminus1_or_C_vec(<4 x i1> %cond) {
; CHECK-LABEL: sel_Cminus1_or_C_vec:
; CHECK: # %bb.0:
; CHECK-NEXT: addis 3, 2, .LCPI4_0@toc@ha
; CHECK-NEXT: vspltisw 3, -16
; CHECK-NEXT: vspltisw 4, 15
; CHECK-NEXT: xxleqv 36, 36, 36
; CHECK-NEXT: vslw 2, 2, 4
; CHECK-NEXT: addi 3, 3, .LCPI4_0@toc@l
; CHECK-NEXT: vsubuwm 3, 4, 3
; CHECK-NEXT: vsraw 2, 2, 4
; CHECK-NEXT: lxvd2x 0, 0, 3
; CHECK-NEXT: vslw 2, 2, 3
; CHECK-NEXT: vsraw 2, 2, 3
; CHECK-NEXT: xxswapd 37, 0
; CHECK-NEXT: vadduwm 2, 2, 5
; CHECK-NEXT: xxswapd 35, 0
; CHECK-NEXT: vadduwm 2, 2, 3
; CHECK-NEXT: blr
%add = select <4 x i1> %cond, <4 x i32> <i32 43, i32 1, i32 -1, i32 0>, <4 x i32> <i32 44, i32 2, i32 0, i32 1>
ret <4 x i32> %add
@ -114,9 +110,7 @@ define <4 x i32> @cmp_sel_Cminus1_or_C_vec(<4 x i32> %x, <4 x i32> %y) {
define <4 x i32> @sel_minus1_or_0_vec(<4 x i1> %cond) {
; CHECK-LABEL: sel_minus1_or_0_vec:
; CHECK: # %bb.0:
; CHECK-NEXT: vspltisw 3, -16
; CHECK-NEXT: vspltisw 4, 15
; CHECK-NEXT: vsubuwm 3, 4, 3
; CHECK-NEXT: xxleqv 35, 35, 35
; CHECK-NEXT: vslw 2, 2, 3
; CHECK-NEXT: vsraw 2, 2, 3
; CHECK-NEXT: blr