diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index ae63b09bf33e..423c4969783c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1993,22 +1993,25 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); // We can lower types that have elements to compact. + for (auto VT : + {MVT::nxv4i32, MVT::nxv2i64, MVT::nxv2f32, MVT::nxv4f32, MVT::nxv2f64}) + setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom); + + // If we have SVE, we can use SVE logic for legal NEON vectors in the lowest + // bits of the SVE register. + for (auto VT : {MVT::v2i32, MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, + MVT::v2f64}) + setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom); + for (auto VT : {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32, MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32}) { - setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom); // Use a custom lowering for masked stores that could be a supported // compressing store. Note: These types still use the normal (Legal) // lowering for non-compressing masked stores. setOperationAction(ISD::MSTORE, VT, Custom); } - // If we have SVE, we can use SVE logic for legal (or smaller than legal) - // NEON vectors in the lowest bits of the SVE register. - for (auto VT : {MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, MVT::v2f32, - MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, MVT::v4f32}) - setOperationAction(ISD::VECTOR_COMPRESS, VT, Custom); - // Histcnt is SVE2 only if (Subtarget->hasSVE2()) { setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv4i32, @@ -7413,92 +7416,36 @@ SDValue AArch64TargetLowering::LowerLOAD(SDValue Op, return SDValue(); } -// Convert to ContainerVT with no-op casts where possible. -static SDValue convertToSVEContainerType(SDLoc DL, SDValue Vec, EVT ContainerVT, - SelectionDAG &DAG) { - EVT VecVT = Vec.getValueType(); - if (VecVT.isFloatingPoint()) { - // Use no-op casts for floating-point types. - EVT PackedVT = getPackedSVEVectorVT(VecVT.getScalarType()); - Vec = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedVT, Vec); - Vec = DAG.getNode(AArch64ISD::NVCAST, DL, ContainerVT, Vec); - } else { - // Extend integers (may not be a no-op). - Vec = DAG.getNode(ISD::ANY_EXTEND, DL, ContainerVT, Vec); - } - return Vec; -} +SDValue AArch64TargetLowering::LowerFixedLengthVectorCompressToSVE( + SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); -// Convert to VecVT with no-op casts where possible. -static SDValue convertFromSVEContainerType(SDLoc DL, SDValue Vec, EVT VecVT, - SelectionDAG &DAG) { - if (VecVT.isFloatingPoint()) { - // Use no-op casts for floating-point types. - EVT PackedVT = getPackedSVEVectorVT(VecVT.getScalarType()); - Vec = DAG.getNode(AArch64ISD::NVCAST, DL, PackedVT, Vec); - Vec = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VecVT, Vec); - } else { - // Truncate integers (may not be a no-op). - Vec = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Vec); - } - return Vec; + EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT); + SDValue Vec = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0)); + SDValue Mask = convertFixedMaskToScalableVector(Op.getOperand(1), DAG); + SDValue Passthru = + convertToScalableVector(DAG, ContainerVT, Op.getOperand(2)); + + SDValue Result = + DAG.getNode(ISD::VECTOR_COMPRESS, DL, ContainerVT, Vec, Mask, Passthru); + return convertFromScalableVector(DAG, VT, Result); } SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + if (!Subtarget->isSVEAvailable()) + return SDValue(); + + if (VT.isFixedLengthVector()) + return LowerFixedLengthVectorCompressToSVE(Op, DAG); + SDLoc DL(Op); SDValue Vec = Op.getOperand(0); SDValue Mask = Op.getOperand(1); SDValue Passthru = Op.getOperand(2); - EVT VecVT = Vec.getValueType(); EVT MaskVT = Mask.getValueType(); - EVT ElmtVT = VecVT.getVectorElementType(); - const bool IsFixedLength = VecVT.isFixedLengthVector(); - const bool HasPassthru = !Passthru.isUndef(); - unsigned MinElmts = VecVT.getVectorElementCount().getKnownMinValue(); - EVT FixedVecVT = MVT::getVectorVT(ElmtVT.getSimpleVT(), MinElmts); - - assert(VecVT.isVector() && "Input to VECTOR_COMPRESS must be vector."); - - if (!Subtarget->isSVEAvailable()) - return SDValue(); - - if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128) - return SDValue(); - - // Only supported for compact. - if (MinElmts != 2 && MinElmts != 4) - return SDValue(); - - // We can use the SVE register containing the NEON vector in its lowest bits. - if (IsFixedLength) { - EVT ScalableVecVT = - MVT::getScalableVectorVT(ElmtVT.getSimpleVT(), MinElmts); - EVT ScalableMaskVT = MVT::getScalableVectorVT( - MaskVT.getVectorElementType().getSimpleVT(), MinElmts); - - Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT, - DAG.getUNDEF(ScalableVecVT), Vec, - DAG.getConstant(0, DL, MVT::i64)); - Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableMaskVT, - DAG.getUNDEF(ScalableMaskVT), Mask, - DAG.getConstant(0, DL, MVT::i64)); - Mask = DAG.getNode(ISD::TRUNCATE, DL, - ScalableMaskVT.changeVectorElementType(MVT::i1), Mask); - Passthru = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT, - DAG.getUNDEF(ScalableVecVT), Passthru, - DAG.getConstant(0, DL, MVT::i64)); - - VecVT = Vec.getValueType(); - MaskVT = Mask.getValueType(); - } - - // Get legal type for compact instruction - EVT ContainerVT = getSVEContainerType(VecVT); - - // Convert to 32 or 64 bits for smaller types, as these are the only supported - // sizes for compact. - Vec = convertToSVEContainerType(DL, Vec, ContainerVT, DAG); SDValue Compressed = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, Vec.getValueType(), @@ -7506,40 +7453,21 @@ SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op, Vec); // compact fills with 0s, so if our passthru is all 0s, do nothing here. - if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) { - SDValue Offset = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, - DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask, - Mask); + if (Passthru.isUndef() || + ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) + return Compressed; - SDValue IndexMask = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, MaskVT, - DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64), - DAG.getConstant(0, DL, MVT::i64), Offset); + SDValue CntActive = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, + DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask, + Mask); - Compressed = - DAG.getNode(ISD::VSELECT, DL, VecVT, IndexMask, Compressed, Passthru); - } + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + SDValue CompressedMask = + DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, MaskVT, Zero, CntActive); - // If we changed the element type before, we need to convert it back. - if (ElmtVT.isFloatingPoint()) - Compressed = convertFromSVEContainerType(DL, Compressed, VecVT, DAG); - - // Extracting from a legal SVE type before truncating produces better code. - if (IsFixedLength) { - EVT FixedSubVector = VecVT.isInteger() - ? FixedVecVT.changeVectorElementType( - ContainerVT.getVectorElementType()) - : FixedVecVT; - Compressed = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FixedSubVector, - Compressed, DAG.getConstant(0, DL, MVT::i64)); - VecVT = FixedVecVT; - } - - if (VecVT.isInteger()) - Compressed = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed); - - return Compressed; + return DAG.getNode(ISD::VSELECT, DL, VT, CompressedMask, Compressed, + Passthru); } // Generate SUBS and CSEL for integer abs. @@ -29332,10 +29260,6 @@ void AArch64TargetLowering::ReplaceNodeResults( case ISD::VECREDUCE_UMIN: Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG)); return; - case ISD::VECTOR_COMPRESS: - if (SDValue Res = LowerVECTOR_COMPRESS(SDValue(N, 0), DAG)) - Results.push_back(Res); - return; case ISD::ADD: case ISD::FADD: ReplaceAddWithADDP(N, Results, DAG, Subtarget); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index fc6b15859507..2152d479ed61 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -800,6 +800,8 @@ private: SDValue LowerFixedLengthVECTOR_SHUFFLEToSVE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFixedLengthBuildVectorToSVE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFixedLengthVectorCompressToSVE(SDValue Op, + SelectionDAG &DAG) const; SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl &Created) const override; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 00afcb886df2..9558cb516272 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -7987,6 +7987,7 @@ multiclass sve_int_perm_compact_sd { def : SVE_2_Op_Pat(NAME # _S)>; def : SVE_2_Op_Pat(NAME # _D)>; def : SVE_2_Op_Pat(NAME # _D)>; + def : SVE_2_Op_Pat(NAME # _D)>; } multiclass sve_int_perm_compact_bh { @@ -9511,7 +9512,7 @@ multiclass sve_fp8_dot(NAME)>; } @@ -10997,7 +10998,7 @@ class sve2_fp8_cvt_single opc, string mnemonic, multiclass sve2_fp8_cvt_single opc, string mnemonic, ValueType vtd, SDPatternOperator op> { def _BtoH : sve2_fp8_cvt_single; - + def : SVE_1_Op_Pat(NAME # _BtoH)>; } @@ -11042,7 +11043,7 @@ class sve2_fp8_down_cvt_single_top opc, string mnemonic, RegisterOperand let Constraints = "$Zd = $_Zd"; let DestructiveInstType = DestructiveOther; let ElementSize = ZPR8.ElementSize; - + let Uses = [FPMR, FPCR]; let mayLoad = 1; let mayStore = 0; diff --git a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll index cfd343e94baa..fc46460cef75 100644 --- a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll +++ b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll @@ -194,11 +194,9 @@ define <4 x i32> @test_compress_v4i32_with_sve(<4 x i32> %vec, <4 x i1> %mask) { ; CHECK-LABEL: test_compress_v4i32_with_sve: ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: shl v1.4s, v1.4s, #31 -; CHECK-NEXT: cmlt v1.4s, v1.4s, #0 -; CHECK-NEXT: and z1.s, z1.s, #0x1 ; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 ; CHECK-NEXT: compact z0.s, p0, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -212,14 +210,12 @@ define <1 x i32> @test_compress_v1i32_with_sve(<1 x i32> %vec, <1 x i1> %mask) { ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: sbfx w8, w0, #0, #1 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov v1.s[0], w8 -; CHECK-NEXT: ushll v1.2d, v1.2s, #0 -; CHECK-NEXT: and z1.d, z1.d, #0x1 -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: compact z0.d, p0, z0.d -; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 +; CHECK-NEXT: compact z0.s, p0, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %out = call <1 x i32> @llvm.experimental.vector.compress(<1 x i32> %vec, <1 x i1> %mask, <1 x i32> poison) ret <1 x i32> %out @@ -231,28 +227,24 @@ define <4 x double> @test_compress_v4f64_with_sve(<4 x double> %vec, <4 x i1> %m ; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: movi v5.2s, #1 +; CHECK-NEXT: movi v4.2s, #1 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ushll v3.2d, v2.2s, #0 -; CHECK-NEXT: ushll2 v4.2d, v2.4s, #0 -; CHECK-NEXT: and v2.8b, v2.8b, v5.8b +; CHECK-NEXT: ushll2 v5.2d, v2.4s, #0 +; CHECK-NEXT: and v2.8b, v2.8b, v4.8b ; CHECK-NEXT: shl v3.2d, v3.2d, #63 -; CHECK-NEXT: shl v4.2d, v4.2d, #63 +; CHECK-NEXT: shl v4.2d, v5.2d, #63 ; CHECK-NEXT: addp v2.2s, v2.2s, v2.2s -; CHECK-NEXT: cmlt v3.2d, v3.2d, #0 -; CHECK-NEXT: cmlt v4.2d, v4.2d, #0 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: and z3.d, z3.d, #0x1 -; CHECK-NEXT: and z4.d, z4.d, #0x1 -; CHECK-NEXT: and x8, x8, #0x3 -; CHECK-NEXT: lsl x8, x8, #3 ; CHECK-NEXT: cmpne p1.d, p0/z, z3.d, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z4.d, #0 +; CHECK-NEXT: fmov w8, s2 ; CHECK-NEXT: compact z0.d, p1, z0.d +; CHECK-NEXT: and x8, x8, #0x3 ; CHECK-NEXT: compact z1.d, p0, z1.d +; CHECK-NEXT: lsl x8, x8, #3 ; CHECK-NEXT: str q0, [sp] ; CHECK-NEXT: str q1, [x9, x8] ; CHECK-NEXT: ldp q0, q1, [sp], #32 @@ -264,13 +256,12 @@ define <4 x double> @test_compress_v4f64_with_sve(<4 x double> %vec, <4 x i1> %m define <2 x i16> @test_compress_v2i16_with_sve(<2 x i16> %vec, <2 x i1> %mask) { ; CHECK-LABEL: test_compress_v2i16_with_sve: ; CHECK: // %bb.0: -; CHECK-NEXT: ushll v1.2d, v1.2s, #0 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: and z1.d, z1.d, #0x1 -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: compact z0.d, p0, z0.d -; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: shl v1.2s, v1.2s, #31 +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 +; CHECK-NEXT: compact z0.s, p0, z0.s +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret %out = call <2 x i16> @llvm.experimental.vector.compress(<2 x i16> %vec, <2 x i1> %mask, <2 x i16> poison) ret <2 x i16> %out