From d9d71bdc14c510a258f9ad61be9ad4a3db735f49 Mon Sep 17 00:00:00 2001
From: David Green
Date: Thu, 21 Aug 2025 09:54:42 +0100
Subject: [PATCH] [AArch64] Move BSL generation to lowering. (#151855)

It is generally better to let the target-independent combines run before
creating AArch64-specific nodes (provided they don't mess the nodes up).
This moves the generation of BSL nodes to lowering, not a combine, so
that intermediate nodes are more likely to be optimized. There is a
small change in the constant handling to detect legalized buildvector
arguments correctly.

Fixes #149380, though not directly; #151856 contained a direct fix for
expanding the pseudos.
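As an illustration (this IR is a sketch of mine, not taken from the
patch or its tests, and the function name bsl_const_mask is
hypothetical), the pattern being targeted is an OR of two ANDs whose
constant masks are bitwise complements:

  define <8 x i16> @bsl_const_mask(<8 x i16> %a, <8 x i16> %b) {
    ; Each lane takes its low byte from %a and its high byte from %b;
    ; the masks 0x00ff and 0xff00 are bitwise complements of each other.
    %m1 = and <8 x i16> %a, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
    %m2 = and <8 x i16> %b, <i16 -256, i16 -256, i16 -256, i16 -256, i16 -256, i16 -256, i16 -256, i16 -256>
    %r = or <8 x i16> %m1, %m2
    ret <8 x i16> %r
  }

Matching this in LowerVectorOR rather than in an early DAG combine means
the generic combiner still sees the plain and/or nodes and can simplify
them before the AArch64-specific BSP/BSL is formed.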
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 30 ++----
 .../AArch64/GlobalISel/combine-udiv.ll        | 12 +--
 llvm/test/CodeGen/AArch64/combine-sdiv.ll     | 99 +++++++------
 .../AArch64/neon-bitwise-instructions.ll      | 73 ++++++++++----
 llvm/test/CodeGen/AArch64/urem-vector-lkk.ll  |  7 +-
 5 files changed, 100 insertions(+), 121 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d168cc8d1bd0..f6b214078f58 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1121,7 +1121,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
   setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
 
-  // We combine OR nodes for bitfield operations.
+  // We combine OR nodes for ccmp operations.
   setTargetDAGCombine(ISD::OR);
   // Try to create BICs for vector ANDs.
   setTargetDAGCombine(ISD::AND);
@@ -14799,23 +14799,15 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
   return ResultSLI;
 }
 
-static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
-                               const AArch64TargetLowering &TLI) {
+static SDValue tryLowerToBSL(SDValue N, SelectionDAG &DAG) {
   EVT VT = N->getValueType(0);
-  SelectionDAG &DAG = DCI.DAG;
+  assert(VT.isVector() && "Expected vector type in tryLowerToBSL\n");
   SDLoc DL(N);
   const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
 
-  if (!VT.isVector())
-    return SDValue();
-
   if (VT.isScalableVector() && !Subtarget.hasSVE2())
     return SDValue();
 
-  if (VT.isFixedLengthVector() &&
-      (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT)))
-    return SDValue();
-
   SDValue N0 = N->getOperand(0);
   if (N0.getOpcode() != ISD::AND)
     return SDValue();
@@ -14865,14 +14857,13 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
   // We only have to look for constant vectors here since the general, variable
   // case can be handled in TableGen.
   unsigned Bits = VT.getScalarSizeInBits();
-  uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
   for (int i = 1; i >= 0; --i)
     for (int j = 1; j >= 0; --j) {
       APInt Val1, Val2;
 
       if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
           ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
-          (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
+          ~Val1.trunc(Bits) == Val2.trunc(Bits)) {
         return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
                            N0->getOperand(1 - i), N1->getOperand(1 - j));
       }
@@ -14886,7 +14877,8 @@ static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
       ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
       ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
       if (!CN0 || !CN1 ||
-          CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
+          CN0->getAPIntValue().trunc(Bits) !=
+              ~CN1->getAsAPIntVal().trunc(Bits)) {
         FoundMatch = false;
         break;
       }
@@ -14905,6 +14897,9 @@ SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
       !Subtarget->isNeonAvailable()))
     return LowerToScalableOp(Op, DAG);
 
+  if (SDValue Res = tryLowerToBSL(Op, DAG))
+    return Res;
+
   // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
   if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
     return Res;
@@ -19658,17 +19653,10 @@ static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                                 const AArch64Subtarget *Subtarget,
                                 const AArch64TargetLowering &TLI) {
   SelectionDAG &DAG = DCI.DAG;
-  EVT VT = N->getValueType(0);
 
   if (SDValue R = performANDORCSELCombine(N, DAG))
     return R;
 
-  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
-    return SDValue();
-
-  if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
-    return Res;
-
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
index b681e3b22311..7872c027aff2 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
@@ -155,16 +155,12 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 ; SDAG:       // %bb.0:
 ; SDAG-NEXT:    movi v1.16b, #171
 ; SDAG-NEXT:    adrp x8, .LCPI4_0
-; SDAG-NEXT:    adrp x9, .LCPI4_1
-; SDAG-NEXT:    ldr q3, [x9, :lo12:.LCPI4_1]
 ; SDAG-NEXT:    umull2 v2.8h, v0.16b, v1.16b
 ; SDAG-NEXT:    umull v1.8h, v0.8b, v1.8b
-; SDAG-NEXT:    and v0.16b, v0.16b, v3.16b
 ; SDAG-NEXT:    uzp2 v1.16b, v1.16b, v2.16b
 ; SDAG-NEXT:    ldr q2, [x8, :lo12:.LCPI4_0]
 ; SDAG-NEXT:    ushr v1.16b, v1.16b, #7
-; SDAG-NEXT:    and v1.16b, v1.16b, v2.16b
-; SDAG-NEXT:    orr v0.16b, v0.16b, v1.16b
+; SDAG-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; SDAG-NEXT:    ret
 ;
 ; GISEL-LABEL: combine_vec_udiv_nonuniform4:
@@ -192,7 +188,6 @@ define <8 x i16> @pr38477(<8 x i16> %a0) {
 ; SDAG-LABEL: pr38477:
 ; SDAG:       // %bb.0:
 ; SDAG-NEXT:    adrp x8, .LCPI5_0
-; SDAG-NEXT:    adrp x9, .LCPI5_4
 ; SDAG-NEXT:    ldr q1, [x8, :lo12:.LCPI5_0]
 ; SDAG-NEXT:    adrp x8, .LCPI5_1
 ; SDAG-NEXT:    ldr q3, [x8, :lo12:.LCPI5_1]
@@ -203,16 +198,13 @@ define <8 x i16> @pr38477(<8 x i16> %a0) {
 ; SDAG-NEXT:    sub v2.8h, v0.8h, v1.8h
 ; SDAG-NEXT:    umull2 v4.4s, v2.8h, v3.8h
 ; SDAG-NEXT:    umull v2.4s, v2.4h, v3.4h
-; SDAG-NEXT:    ldr q3, [x9, :lo12:.LCPI5_4]
-; SDAG-NEXT:    and v0.16b, v0.16b, v3.16b
 ; SDAG-NEXT:    uzp2 v2.8h, v2.8h, v4.8h
 ; SDAG-NEXT:    add v1.8h, v2.8h, v1.8h
 ; SDAG-NEXT:    ldr q2, [x8, :lo12:.LCPI5_2]
 ; SDAG-NEXT:    adrp x8, .LCPI5_3
 ; SDAG-NEXT:    ushl v1.8h, v1.8h, v2.8h
 ; SDAG-NEXT:    ldr q2, [x8, :lo12:.LCPI5_3]
-; SDAG-NEXT:    and v1.16b, v1.16b, v2.16b
-; SDAG-NEXT:    orr v0.16b, v0.16b, v1.16b
+; SDAG-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; SDAG-NEXT:    ret
 ;
 ; GISEL-LABEL: pr38477:
diff --git a/llvm/test/CodeGen/AArch64/combine-sdiv.ll b/llvm/test/CodeGen/AArch64/combine-sdiv.ll
index 6208a697cab1..9d0ade248042 100644
--- a/llvm/test/CodeGen/AArch64/combine-sdiv.ll
+++ b/llvm/test/CodeGen/AArch64/combine-sdiv.ll
@@ -230,14 +230,11 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
 ; CHECK-SD-NEXT:    movi v3.2d, #0x000000000000ff
 ; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI14_0]
 ; CHECK-SD-NEXT:    adrp x8, .LCPI14_1
-; CHECK-SD-NEXT:    movi v4.2d, #0xffffffffffffff00
 ; CHECK-SD-NEXT:    ushl v1.16b, v1.16b, v2.16b
 ; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI14_1]
 ; CHECK-SD-NEXT:    add v1.16b, v0.16b, v1.16b
-; CHECK-SD-NEXT:    and v0.16b, v0.16b, v3.16b
 ; CHECK-SD-NEXT:    sshl v1.16b, v1.16b, v2.16b
-; CHECK-SD-NEXT:    and v1.16b, v1.16b, v4.16b
-; CHECK-SD-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    bif v0.16b, v1.16b, v3.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v16i8:
@@ -265,21 +262,17 @@ define <16 x i8> @combine_vec_sdiv_by_pow2b_v16i8(<16 x i8> %x) {
 define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) {
 ; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
 ; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    adrp x8, .LCPI15_1
+; CHECK-SD-NEXT:    adrp x8, .LCPI15_0
 ; CHECK-SD-NEXT:    cmlt v1.8h, v0.8h, #0
-; CHECK-SD-NEXT:    adrp x9, .LCPI15_3
+; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI15_0]
+; CHECK-SD-NEXT:    adrp x8, .LCPI15_1
+; CHECK-SD-NEXT:    ushl v1.8h, v1.8h, v2.8h
 ; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI15_1]
 ; CHECK-SD-NEXT:    adrp x8, .LCPI15_2
-; CHECK-SD-NEXT:    ldr q3, [x9, :lo12:.LCPI15_3]
-; CHECK-SD-NEXT:    ushl v1.8h, v1.8h, v2.8h
-; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI15_2]
-; CHECK-SD-NEXT:    adrp x8, .LCPI15_0
 ; CHECK-SD-NEXT:    add v1.8h, v0.8h, v1.8h
 ; CHECK-SD-NEXT:    sshl v1.8h, v1.8h, v2.8h
-; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI15_0]
-; CHECK-SD-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-SD-NEXT:    and v1.16b, v1.16b, v3.16b
-; CHECK-SD-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI15_2]
+; CHECK-SD-NEXT:    bif v0.16b, v1.16b, v2.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v8i16:
@@ -308,28 +301,22 @@ define <8 x i16> @combine_vec_sdiv_by_pow2b_v8i16(<8 x i16> %x) {
 define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
 ; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
 ; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    adrp x8, .LCPI16_1
+; CHECK-SD-NEXT:    adrp x8, .LCPI16_0
 ; CHECK-SD-NEXT:    cmlt v2.8h, v0.8h, #0
 ; CHECK-SD-NEXT:    cmlt v3.8h, v1.8h, #0
-; CHECK-SD-NEXT:    ldr q4, [x8, :lo12:.LCPI16_1]
-; CHECK-SD-NEXT:    adrp x8, .LCPI16_2
+; CHECK-SD-NEXT:    ldr q4, [x8, :lo12:.LCPI16_0]
+; CHECK-SD-NEXT:    adrp x8, .LCPI16_1
 ; CHECK-SD-NEXT:    ushl v2.8h, v2.8h, v4.8h
 ; CHECK-SD-NEXT:    ushl v3.8h, v3.8h, v4.8h
-; CHECK-SD-NEXT:    ldr q4, [x8, :lo12:.LCPI16_2]
-; CHECK-SD-NEXT:    adrp x8, .LCPI16_0
-; CHECK-SD-NEXT:    ldr q5, [x8, :lo12:.LCPI16_0]
-; CHECK-SD-NEXT:    adrp x8, .LCPI16_3
+; CHECK-SD-NEXT:    ldr q4, [x8, :lo12:.LCPI16_1]
+; CHECK-SD-NEXT:    adrp x8, .LCPI16_2
 ; CHECK-SD-NEXT:    add v2.8h, v0.8h, v2.8h
 ; CHECK-SD-NEXT:    add v3.8h, v1.8h, v3.8h
-; CHECK-SD-NEXT:    and v0.16b, v0.16b, v5.16b
-; CHECK-SD-NEXT:    and v1.16b, v1.16b, v5.16b
 ; CHECK-SD-NEXT:    sshl v2.8h, v2.8h, v4.8h
 ; CHECK-SD-NEXT:    sshl v3.8h, v3.8h, v4.8h
-; CHECK-SD-NEXT:    ldr q4, [x8, :lo12:.LCPI16_3]
-; CHECK-SD-NEXT:    and v2.16b, v2.16b, v4.16b
-; CHECK-SD-NEXT:    and v3.16b, v3.16b, v4.16b
-; CHECK-SD-NEXT:    orr v0.16b, v0.16b, v2.16b
-; CHECK-SD-NEXT:    orr v1.16b, v1.16b, v3.16b
+; CHECK-SD-NEXT:    ldr q4, [x8, :lo12:.LCPI16_2]
+; CHECK-SD-NEXT:    bif v0.16b, v2.16b, v4.16b
+; CHECK-SD-NEXT:    bif v1.16b, v3.16b, v4.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v16i16:
@@ -363,42 +350,32 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
 define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
 ; CHECK-SD-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
 ; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    adrp x8, .LCPI17_1
+; CHECK-SD-NEXT:    adrp x8, .LCPI17_0
 ; CHECK-SD-NEXT:    cmlt v4.8h, v0.8h, #0
 ; CHECK-SD-NEXT:    cmlt v5.8h, v1.8h, #0
 ; CHECK-SD-NEXT:    cmlt v7.8h, v2.8h, #0
 ; CHECK-SD-NEXT:    cmlt v16.8h, v3.8h, #0
-; CHECK-SD-NEXT:    ldr q6, [x8, :lo12:.LCPI17_1]
-; CHECK-SD-NEXT:    adrp x8, .LCPI17_2
+; CHECK-SD-NEXT:    ldr q6, [x8, :lo12:.LCPI17_0]
+; CHECK-SD-NEXT:    adrp x8, .LCPI17_1
 ; CHECK-SD-NEXT:    ushl v4.8h, v4.8h, v6.8h
 ; CHECK-SD-NEXT:    ushl v5.8h, v5.8h, v6.8h
 ; CHECK-SD-NEXT:    ushl v7.8h, v7.8h, v6.8h
 ; CHECK-SD-NEXT:    ushl v6.8h, v16.8h, v6.8h
-; CHECK-SD-NEXT:    ldr q16, [x8, :lo12:.LCPI17_2]
-; CHECK-SD-NEXT:    adrp x8, .LCPI17_0
+; CHECK-SD-NEXT:    ldr q16, [x8, :lo12:.LCPI17_1]
+; CHECK-SD-NEXT:    adrp x8, .LCPI17_2
 ; CHECK-SD-NEXT:    add v4.8h, v0.8h, v4.8h
 ; CHECK-SD-NEXT:    add v5.8h, v1.8h, v5.8h
-; CHECK-SD-NEXT:    ldr q17, [x8, :lo12:.LCPI17_0]
 ; CHECK-SD-NEXT:    add v7.8h, v2.8h, v7.8h
 ; CHECK-SD-NEXT:    add v6.8h, v3.8h, v6.8h
-; CHECK-SD-NEXT:    adrp x8, .LCPI17_3
-; CHECK-SD-NEXT:    and v0.16b, v0.16b, v17.16b
-; CHECK-SD-NEXT:    and v1.16b, v1.16b, v17.16b
-; CHECK-SD-NEXT:    and v2.16b, v2.16b, v17.16b
 ; CHECK-SD-NEXT:    sshl v4.8h, v4.8h, v16.8h
 ; CHECK-SD-NEXT:    sshl v5.8h, v5.8h, v16.8h
-; CHECK-SD-NEXT:    and v3.16b, v3.16b, v17.16b
 ; CHECK-SD-NEXT:    sshl v7.8h, v7.8h, v16.8h
 ; CHECK-SD-NEXT:    sshl v6.8h, v6.8h, v16.8h
-; CHECK-SD-NEXT:    ldr q16, [x8, :lo12:.LCPI17_3]
-; CHECK-SD-NEXT:    and v4.16b, v4.16b, v16.16b
-; CHECK-SD-NEXT:    and v5.16b, v5.16b, v16.16b
-; CHECK-SD-NEXT:    and v7.16b, v7.16b, v16.16b
-; CHECK-SD-NEXT:    and v6.16b, v6.16b, v16.16b
-; CHECK-SD-NEXT:    orr v0.16b, v0.16b, v4.16b
-; CHECK-SD-NEXT:    orr v1.16b, v1.16b, v5.16b
-; CHECK-SD-NEXT:    orr v2.16b, v2.16b, v7.16b
-; CHECK-SD-NEXT:    orr v3.16b, v3.16b, v6.16b
+; CHECK-SD-NEXT:    ldr q16, [x8, :lo12:.LCPI17_2]
+; CHECK-SD-NEXT:    bif v0.16b, v4.16b, v16.16b
+; CHECK-SD-NEXT:    bif v1.16b, v5.16b, v16.16b
+; CHECK-SD-NEXT:    bif v2.16b, v7.16b, v16.16b
+; CHECK-SD-NEXT:    bif v3.16b, v6.16b, v16.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: combine_vec_sdiv_by_pow2b_v32i16:
@@ -904,29 +881,21 @@ define <16 x i8> @non_splat_minus_one_divisor_0(<16 x i8> %A) {
 define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
 ; CHECK-SD-LABEL: non_splat_minus_one_divisor_1:
 ; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    adrp x8, .LCPI26_1
+; CHECK-SD-NEXT:    adrp x8, .LCPI26_0
 ; CHECK-SD-NEXT:    cmlt v1.16b, v0.16b, #0
-; CHECK-SD-NEXT:    adrp x9, .LCPI26_3
+; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI26_0]
+; CHECK-SD-NEXT:    adrp x8, .LCPI26_1
+; CHECK-SD-NEXT:    ushl v1.16b, v1.16b, v2.16b
 ; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI26_1]
 ; CHECK-SD-NEXT:    adrp x8, .LCPI26_2
-; CHECK-SD-NEXT:    ldr q3, [x9, :lo12:.LCPI26_3]
-; CHECK-SD-NEXT:    adrp x9, .LCPI26_5
-; CHECK-SD-NEXT:    ushl v1.16b, v1.16b, v2.16b
-; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI26_2]
-; CHECK-SD-NEXT:    adrp x8, .LCPI26_0
 ; CHECK-SD-NEXT:    add v1.16b, v0.16b, v1.16b
 ; CHECK-SD-NEXT:    sshl v1.16b, v1.16b, v2.16b
-; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI26_0]
-; CHECK-SD-NEXT:    adrp x8, .LCPI26_4
-; CHECK-SD-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI26_4]
-; CHECK-SD-NEXT:    and v1.16b, v1.16b, v3.16b
-; CHECK-SD-NEXT:    ldr q3, [x9, :lo12:.LCPI26_5]
-; CHECK-SD-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI26_2]
+; CHECK-SD-NEXT:    adrp x8, .LCPI26_3
+; CHECK-SD-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-SD-NEXT:    ldr q2, [x8, :lo12:.LCPI26_3]
 ; CHECK-SD-NEXT:    neg v1.16b, v0.16b
-; CHECK-SD-NEXT:    and v0.16b, v0.16b, v2.16b
-; CHECK-SD-NEXT:    and v1.16b, v1.16b, v3.16b
-; CHECK-SD-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-SD-NEXT:    bit v0.16b, v1.16b, v2.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: non_splat_minus_one_divisor_1:
diff --git a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
index 7e6f3548bdaf..0c84468f3934 100644
--- a/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -884,8 +884,10 @@ define <2 x i64> @orn2xi64(<2 x i64> %a, <2 x i64> %b) {
 define <2 x i32> @bsl2xi32_const(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-SD-LABEL: bsl2xi32_const:
 ; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    movi d2, #0x000000ffffffff
-; CHECK-SD-NEXT:    bif v0.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    mov v0.s[1], v1.s[1]
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: bsl2xi32_const:
@@ -923,8 +925,10 @@ define <4 x i16> @bsl4xi16_const(<4 x i16> %a, <4 x i16> %b) {
 define <1 x i64> @bsl1xi64_const(<1 x i64> %a, <1 x i64> %b) {
 ; CHECK-SD-LABEL: bsl1xi64_const:
 ; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    movi d2, #0xffffffffffffff00
-; CHECK-SD-NEXT:    bif v0.8b, v1.8b, v2.8b
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    mov v0.b[0], v1.b[0]
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: bsl1xi64_const:
@@ -981,12 +985,17 @@ define <8 x i16> @bsl8xi16_const(<8 x i16> %a, <8 x i16> %b) {
 }
 
 define <2 x i64> @bsl2xi64_const(<2 x i64> %a, <2 x i64> %b) {
-; CHECK-LABEL: bsl2xi64_const:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    adrp x8, .LCPI75_0
-; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI75_0]
-; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: bsl2xi64_const:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov v0.d[1], v1.d[1]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: bsl2xi64_const:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    adrp x8, .LCPI75_0
+; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI75_0]
+; CHECK-GI-NEXT:    bif v0.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    ret
 %tmp1 = and <2 x i64> %a, < i64 -1, i64 0 >
 %tmp2 = and <2 x i64> %b, < i64 0, i64 -1 >
 %tmp3 = or <2 x i64> %tmp1, %tmp2
@@ -1158,11 +1167,8 @@ define <4 x i32> @vselect_constant_cond_zero_v4i32(<4 x i32> %a) {
 define <8 x i8> @vselect_constant_cond_v8i8(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-SD-LABEL: vselect_constant_cond_v8i8:
 ; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    movi d2, #0xffffffffff00ff00
-; CHECK-SD-NEXT:    movi d3, #0x00000000ff00ff
-; CHECK-SD-NEXT:    and v1.8b, v1.8b, v2.8b
-; CHECK-SD-NEXT:    and v0.8b, v0.8b, v3.8b
-; CHECK-SD-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    movi d2, #0x00000000ff00ff
+; CHECK-SD-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: vselect_constant_cond_v8i8:
@@ -1180,11 +1186,8 @@ define <8 x i8> @vselect_constant_cond_v8i8(<8 x i8> %a, <8 x i8> %b) {
 define <4 x i16> @vselect_constant_cond_v4i16(<4 x i16> %a, <4 x i16> %b) {
 ; CHECK-SD-LABEL: vselect_constant_cond_v4i16:
 ; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    movi d2, #0x00ffffffff0000
-; CHECK-SD-NEXT:    movi d3, #0xffff00000000ffff
-; CHECK-SD-NEXT:    and v1.8b, v1.8b, v2.8b
-; CHECK-SD-NEXT:    and v0.8b, v0.8b, v3.8b
-; CHECK-SD-NEXT:    orr v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    movi d2, #0xffff00000000ffff
+; CHECK-SD-NEXT:    bif v0.8b, v1.8b, v2.8b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: vselect_constant_cond_v4i16:
@@ -2839,3 +2842,33 @@ define <2 x i64> @orr64imm8h_lsl8(<2 x i64> %a) {
   ret <2 x i64> %tmp1
 }
 
+define <8 x i16> @pr149380(<4 x i16> %u1, <1 x i64> %u2, <8 x i16> %vqshlu_n169) {
+; CHECK-SD-LABEL: pr149380:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    movi v0.8h, #1
+; CHECK-SD-NEXT:    orr v2.8h, #1
+; CHECK-SD-NEXT:    sqadd v0.8h, v2.8h, v0.8h
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: pr149380:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    movi v0.2d, #0xffffffffffffffff
+; CHECK-GI-NEXT:    neg v1.8h, v2.8h
+; CHECK-GI-NEXT:    movi v3.8h, #1
+; CHECK-GI-NEXT:    neg v1.8h, v1.8h
+; CHECK-GI-NEXT:    sub v0.8h, v0.8h, v2.8h
+; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-GI-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-GI-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-GI-NEXT:    sqadd v0.8h, v3.8h, v0.8h
+; CHECK-GI-NEXT:    ret
+entry:
+  %mul.i = mul <8 x i16> %vqshlu_n169, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
+  %sub.i = sub <8 x i16> zeroinitializer, %mul.i
+  %vbsl3.i = and <8 x i16> %sub.i, %vqshlu_n169
+  %0 = add <8 x i16> %mul.i, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
+  %vbsl4.i = and <8 x i16> %0, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1 >
+  %vbsl5.i = or <8 x i16> %vbsl3.i, %vbsl4.i
+  %vqaddq_v2.i26515 = tail call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1 >, <8 x i16> %vbsl5.i)
+  ret <8 x i16> %vqaddq_v2.i26515
+}
diff --git a/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll
index 468a33ce5bfc..4be8c3775c70 100644
--- a/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll
@@ -88,7 +88,6 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
 ; CHECK-LABEL: dont_fold_urem_one:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, .LCPI4_0
-; CHECK-NEXT:    movi d4, #0x0000000000ffff
 ; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI4_0]
 ; CHECK-NEXT:    adrp x8, .LCPI4_1
 ; CHECK-NEXT:    ldr d3, [x8, :lo12:.LCPI4_1]
@@ -97,16 +96,14 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
 ; CHECK-NEXT:    shrn v1.4h, v1.4s, #16
 ; CHECK-NEXT:    sub v2.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    umull v2.4s, v2.4h, v3.4h
-; CHECK-NEXT:    movi d3, #0xffffffffffff0000
+; CHECK-NEXT:    movi d3, #0x0000000000ffff
 ; CHECK-NEXT:    shrn v2.4h, v2.4s, #16
 ; CHECK-NEXT:    add v1.4h, v2.4h, v1.4h
 ; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI4_2]
 ; CHECK-NEXT:    adrp x8, .LCPI4_3
 ; CHECK-NEXT:    ushl v1.4h, v1.4h, v2.4h
-; CHECK-NEXT:    and v2.8b, v0.8b, v4.8b
-; CHECK-NEXT:    and v1.8b, v1.8b, v3.8b
-; CHECK-NEXT:    orr v1.8b, v2.8b, v1.8b
 ; CHECK-NEXT:    ldr d2, [x8, :lo12:.LCPI4_3]
+; CHECK-NEXT:    bit v1.8b, v0.8b, v3.8b
 ; CHECK-NEXT:    mls v0.4h, v1.4h, v2.4h
 ; CHECK-NEXT:    ret
 %1 = urem <4 x i16> %x,