[AArch64] Improve lowering for scalable masked deinterleaving loads (#154338)

For IR like this:

  %mask = ... @llvm.vector.interleave2(<vscale x 16 x i1> %a, <vscale x 16 x i1> %a)
  %vec = ... @llvm.masked.load(..., <vscale x 32 x i1> %mask, ...)
  %dvec = ... @llvm.vector.deinterleave2(<vscale x 32 x i8> %vec)

where we're deinterleaving a wide masked load of a supported type whose
mask is an interleave of identical masks, we can lower this directly to
an ld2b instruction. Similarly, we can also support other variants of
ld2 and ld4.

This PR adds a DAG combine to spot such patterns and lower to ld2X
or ld4X variants accordingly, whilst being careful to ensure the
masked load is only used by the deinterleave intrinsic.
This commit is contained in:
David Sherwood 2025-09-03 09:51:54 +01:00 committed by GitHub
parent 349523e26b
commit 73bed64433
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 871 additions and 0 deletions

View File

@ -3338,6 +3338,14 @@ namespace ISD {
return St && St->getAddressingMode() == ISD::UNINDEXED;
}
/// Returns true if \p N is a masked load that neither extends the loaded
/// value nor uses an indexed addressing mode. Any other kind of node yields
/// false.
inline bool isNormalMaskedLoad(const SDNode *N) {
  if (const auto *MaskedLd = dyn_cast<MaskedLoadSDNode>(N))
    return MaskedLd->getExtensionType() == ISD::NON_EXTLOAD &&
           MaskedLd->getAddressingMode() == ISD::UNINDEXED;
  return false;
}
/// Attempt to match a unary predicate against a scalar/splat constant or
/// every element of a constant BUILD_VECTOR.
/// If AllowUndef is true, then UNDEF elements will pass nullptr to Match.

View File

@ -1179,6 +1179,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::VECTOR_DEINTERLEAVE);
// In case of strict alignment, avoid an excessive number of byte wide stores.
MaxStoresPerMemsetOptSize = 8;
@ -27207,6 +27208,115 @@ performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return NVCAST;
}
/// Try to fold a VECTOR_DEINTERLEAVE whose operands are the sequential
/// subvectors of a single scalable masked load into one aarch64_sve_ld2_sret /
/// aarch64_sve_ld4_sret intrinsic node.
///
/// The fold only fires when the load mask is provably an interleave of
/// identical narrower masks (either a CONCAT_VECTORS of all results of a
/// VECTOR_INTERLEAVE with identical inputs, or a SPLAT_VECTOR), so that one
/// narrow predicate can govern every part of the structured load.
///
/// \returns the result of DCI.CombineTo on success, or an empty SDValue if the
/// pattern does not match.
static SDValue performVectorDeinterleaveCombine(
    SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
  if (!DCI.isBeforeLegalize())
    return SDValue();

  // Only two-way and four-way deinterleaves are handled here.
  const unsigned NumParts = N->getNumOperands();
  if (NumParts != 2 && NumParts != 4)
    return SDValue();

  // At the moment we're unlikely to see a fixed-width vector deinterleave as
  // we usually generate shuffles instead, so insist on a legal scalable type
  // with a known minimum size of 128 bits.
  EVT SubVecTy = N->getValueType(0);
  const bool IsSupportedTy =
      SubVecTy.isScalableVector() &&
      SubVecTy.getSizeInBits().getKnownMinValue() == 128 &&
      DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy);
  if (!IsSupportedTy)
    return SDValue();
  const unsigned MinNumElements = SubVecTy.getVectorMinNumElements();

  // Operand number `Part` must be the extract_subvector, at minimum element
  // offset Part * MinNumElements, of the vector the first operand extracts
  // from.
  SDValue FirstPart = N->getOperand(0);
  for (unsigned Part = 0; Part != NumParts; ++Part) {
    SDValue Extract = N->getOperand(Part);
    if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR)
      return SDValue();
    if (Extract->getOperand(0) != FirstPart->getOperand(0))
      return SDValue();
    if (Extract->getConstantOperandVal(1) != (Part * MinNumElements))
      return SDValue();
  }

  // Normal loads are currently already handled by the InterleavedAccessPass so
  // we don't expect to see them here; the wide vector must be a masked load.
  auto *WideLoad = dyn_cast<MaskedLoadSDNode>(FirstPart->getOperand(0));
  if (!WideLoad)
    return SDValue();

  // Bail out if the masked load has an unexpected number of uses, since we
  // want to avoid a situation where we have both deinterleaving loads and
  // normal loads in the same block. Also reject loads that are not simple,
  // are extending or indexed, or carry an unexpected offset.
  if (!WideLoad->hasNUsesOfValue(NumParts, 0) || !WideLoad->isSimple() ||
      !ISD::isNormalMaskedLoad(WideLoad) || !WideLoad->getOffset().isUndef())
    return SDValue();

  // Only undef or all-zeros pass-through values are accepted until a valid
  // use case for anything else turns up.
  SDValue PassThru = WideLoad->getPassThru();
  if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode()))
    return SDValue();

  // Now prove that the mask is an interleave of identical masks and recover
  // the narrow mask that was interleaved.
  SDValue Mask = WideLoad->getMask();
  SDLoc DL(N);
  SDValue NarrowMask;
  switch (Mask->getOpcode()) {
  case ISD::CONCAT_VECTORS: {
    if (Mask->getNumOperands() != NumParts)
      return SDValue();

    // The concatenation must consume every result of one VECTOR_INTERLEAVE,
    // in order.
    SDNode *Interleave = Mask->getOperand(0).getNode();
    if (Interleave->getOpcode() != ISD::VECTOR_INTERLEAVE ||
        Interleave->getNumOperands() != NumParts)
      return SDValue();
    for (unsigned Part = 0; Part != NumParts; ++Part)
      if (Mask.getOperand(Part) != SDValue(Interleave, Part))
        return SDValue();

    // All inputs to the interleave have to be the very same value.
    if (!llvm::all_equal(Interleave->op_values()))
      return SDValue();
    NarrowMask = Interleave->getOperand(0);
    break;
  }
  case ISD::SPLAT_VECTOR: {
    // A splat mask stays a splat when narrowed; rebuild it with the element
    // count divided by the number of parts.
    ElementCount EC = Mask.getValueType().getVectorElementCount();
    assert(EC.isKnownMultipleOf(NumParts) &&
           "Expected element count divisible by number of parts");
    EC = EC.divideCoefficientBy(NumParts);
    NarrowMask =
        DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::getVectorVT(MVT::i1, EC),
                    Mask->getOperand(0));
    break;
  }
  default:
    return SDValue();
  }

  // Build the structured load: chain, intrinsic id, predicate, base pointer.
  Intrinsic::ID IID = Intrinsic::aarch64_sve_ld4_sret;
  if (NumParts == 2)
    IID = Intrinsic::aarch64_sve_ld2_sret;
  SDValue NewLdOps[] = {WideLoad->getChain(),
                        DAG.getConstant(IID, DL, MVT::i32), NarrowMask,
                        WideLoad->getBasePtr()};

  // The node produces NumParts vectors of type SubVecTy followed by a chain.
  SmallVector<EVT, 5> ResultTys(NumParts, SubVecTy);
  ResultTys.push_back(MVT::Other);
  SDValue StructLoad = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
                                   DAG.getVTList(ResultTys), NewLdOps);

  SmallVector<SDValue, 4> Parts;
  for (unsigned Idx = 0; Idx != NumParts; ++Idx)
    Parts.push_back(StructLoad.getValue(Idx));

  // Replace uses of the original chain result with the new chain result,
  // which is the value following the NumParts vector results.
  DAG.ReplaceAllUsesOfValueWith(SDValue(WideLoad, 1),
                                StructLoad.getValue(NumParts));
  return DCI.CombineTo(N, Parts, false);
}
/// If the operand is a bitwise AND with a constant RHS, and the shift has a
/// constant RHS and is the only use, we can pull it out of the shift, i.e.
///
@ -27275,6 +27385,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
default:
LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
break;
case ISD::VECTOR_DEINTERLEAVE:
return performVectorDeinterleaveCombine(N, DCI, DAG);
case ISD::VECREDUCE_AND:
case ISD::VECREDUCE_OR:
case ISD::VECREDUCE_XOR:

View File

@ -0,0 +1,464 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
; Fixed-width <16 x i8> case: %mask is interleaved with itself, used as the mask
; of a <32 x i8> masked load, and the loaded value is passed to deinterleave2.
; The expected output tests each of the 32 mask bits and conditionally loads
; one lane at a time, then splits the two halves with uzp1/uzp2. Both RUN lines
; use the same FileCheck prefix, so this output is expected with and without
; +sve.
define { <16 x i8>, <16 x i8> } @foo_ld2_v16i8(<16 x i1> %mask, ptr %p) {
; CHECK-LABEL: foo_ld2_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: zip2 v1.16b, v0.16b, v0.16b
; CHECK-NEXT: zip1 v0.16b, v0.16b, v0.16b
; CHECK-NEXT: adrp x8, .LCPI0_0
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI0_0]
; CHECK-NEXT: shl v1.16b, v1.16b, #7
; CHECK-NEXT: shl v0.16b, v0.16b, #7
; CHECK-NEXT: cmlt v1.16b, v1.16b, #0
; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8
; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: zip1 v1.16b, v1.16b, v2.16b
; CHECK-NEXT: zip1 v0.16b, v0.16b, v3.16b
; CHECK-NEXT: addv h1, v1.8h
; CHECK-NEXT: addv h0, v0.8h
; CHECK-NEXT: fmov w9, s1
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: bfi w8, w9, #16, #16
; CHECK-NEXT: tbz w8, #0, .LBB0_2
; CHECK-NEXT: // %bb.1: // %cond.load
; CHECK-NEXT: ldr b1, [x0]
; CHECK-NEXT: tbnz w8, #1, .LBB0_3
; CHECK-NEXT: b .LBB0_4
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: // implicit-def: $q1
; CHECK-NEXT: tbz w8, #1, .LBB0_4
; CHECK-NEXT: .LBB0_3: // %cond.load1
; CHECK-NEXT: add x9, x0, #1
; CHECK-NEXT: ld1 { v1.b }[1], [x9]
; CHECK-NEXT: .LBB0_4: // %else2
; CHECK-NEXT: tbnz w8, #2, .LBB0_20
; CHECK-NEXT: // %bb.5: // %else5
; CHECK-NEXT: tbnz w8, #3, .LBB0_21
; CHECK-NEXT: .LBB0_6: // %else8
; CHECK-NEXT: tbnz w8, #4, .LBB0_22
; CHECK-NEXT: .LBB0_7: // %else11
; CHECK-NEXT: tbnz w8, #5, .LBB0_23
; CHECK-NEXT: .LBB0_8: // %else14
; CHECK-NEXT: tbnz w8, #6, .LBB0_24
; CHECK-NEXT: .LBB0_9: // %else17
; CHECK-NEXT: tbnz w8, #7, .LBB0_25
; CHECK-NEXT: .LBB0_10: // %else20
; CHECK-NEXT: tbnz w8, #8, .LBB0_26
; CHECK-NEXT: .LBB0_11: // %else23
; CHECK-NEXT: tbnz w8, #9, .LBB0_27
; CHECK-NEXT: .LBB0_12: // %else26
; CHECK-NEXT: tbnz w8, #10, .LBB0_28
; CHECK-NEXT: .LBB0_13: // %else29
; CHECK-NEXT: tbnz w8, #11, .LBB0_29
; CHECK-NEXT: .LBB0_14: // %else32
; CHECK-NEXT: tbnz w8, #12, .LBB0_30
; CHECK-NEXT: .LBB0_15: // %else35
; CHECK-NEXT: tbnz w8, #13, .LBB0_31
; CHECK-NEXT: .LBB0_16: // %else38
; CHECK-NEXT: tbnz w8, #14, .LBB0_32
; CHECK-NEXT: .LBB0_17: // %else41
; CHECK-NEXT: tbnz w8, #15, .LBB0_33
; CHECK-NEXT: .LBB0_18: // %else44
; CHECK-NEXT: tbz w8, #16, .LBB0_34
; CHECK-NEXT: .LBB0_19: // %cond.load46
; CHECK-NEXT: add x9, x0, #16
; CHECK-NEXT: ld1 { v2.b }[0], [x9]
; CHECK-NEXT: tbnz w8, #17, .LBB0_35
; CHECK-NEXT: b .LBB0_36
; CHECK-NEXT: .LBB0_20: // %cond.load4
; CHECK-NEXT: add x9, x0, #2
; CHECK-NEXT: ld1 { v1.b }[2], [x9]
; CHECK-NEXT: tbz w8, #3, .LBB0_6
; CHECK-NEXT: .LBB0_21: // %cond.load7
; CHECK-NEXT: add x9, x0, #3
; CHECK-NEXT: ld1 { v1.b }[3], [x9]
; CHECK-NEXT: tbz w8, #4, .LBB0_7
; CHECK-NEXT: .LBB0_22: // %cond.load10
; CHECK-NEXT: add x9, x0, #4
; CHECK-NEXT: ld1 { v1.b }[4], [x9]
; CHECK-NEXT: tbz w8, #5, .LBB0_8
; CHECK-NEXT: .LBB0_23: // %cond.load13
; CHECK-NEXT: add x9, x0, #5
; CHECK-NEXT: ld1 { v1.b }[5], [x9]
; CHECK-NEXT: tbz w8, #6, .LBB0_9
; CHECK-NEXT: .LBB0_24: // %cond.load16
; CHECK-NEXT: add x9, x0, #6
; CHECK-NEXT: ld1 { v1.b }[6], [x9]
; CHECK-NEXT: tbz w8, #7, .LBB0_10
; CHECK-NEXT: .LBB0_25: // %cond.load19
; CHECK-NEXT: add x9, x0, #7
; CHECK-NEXT: ld1 { v1.b }[7], [x9]
; CHECK-NEXT: tbz w8, #8, .LBB0_11
; CHECK-NEXT: .LBB0_26: // %cond.load22
; CHECK-NEXT: add x9, x0, #8
; CHECK-NEXT: ld1 { v1.b }[8], [x9]
; CHECK-NEXT: tbz w8, #9, .LBB0_12
; CHECK-NEXT: .LBB0_27: // %cond.load25
; CHECK-NEXT: add x9, x0, #9
; CHECK-NEXT: ld1 { v1.b }[9], [x9]
; CHECK-NEXT: tbz w8, #10, .LBB0_13
; CHECK-NEXT: .LBB0_28: // %cond.load28
; CHECK-NEXT: add x9, x0, #10
; CHECK-NEXT: ld1 { v1.b }[10], [x9]
; CHECK-NEXT: tbz w8, #11, .LBB0_14
; CHECK-NEXT: .LBB0_29: // %cond.load31
; CHECK-NEXT: add x9, x0, #11
; CHECK-NEXT: ld1 { v1.b }[11], [x9]
; CHECK-NEXT: tbz w8, #12, .LBB0_15
; CHECK-NEXT: .LBB0_30: // %cond.load34
; CHECK-NEXT: add x9, x0, #12
; CHECK-NEXT: ld1 { v1.b }[12], [x9]
; CHECK-NEXT: tbz w8, #13, .LBB0_16
; CHECK-NEXT: .LBB0_31: // %cond.load37
; CHECK-NEXT: add x9, x0, #13
; CHECK-NEXT: ld1 { v1.b }[13], [x9]
; CHECK-NEXT: tbz w8, #14, .LBB0_17
; CHECK-NEXT: .LBB0_32: // %cond.load40
; CHECK-NEXT: add x9, x0, #14
; CHECK-NEXT: ld1 { v1.b }[14], [x9]
; CHECK-NEXT: tbz w8, #15, .LBB0_18
; CHECK-NEXT: .LBB0_33: // %cond.load43
; CHECK-NEXT: add x9, x0, #15
; CHECK-NEXT: ld1 { v1.b }[15], [x9]
; CHECK-NEXT: tbnz w8, #16, .LBB0_19
; CHECK-NEXT: .LBB0_34:
; CHECK-NEXT: // implicit-def: $q2
; CHECK-NEXT: tbz w8, #17, .LBB0_36
; CHECK-NEXT: .LBB0_35: // %cond.load49
; CHECK-NEXT: add x9, x0, #17
; CHECK-NEXT: ld1 { v2.b }[1], [x9]
; CHECK-NEXT: .LBB0_36: // %else50
; CHECK-NEXT: tbnz w8, #18, .LBB0_52
; CHECK-NEXT: // %bb.37: // %else53
; CHECK-NEXT: tbnz w8, #19, .LBB0_53
; CHECK-NEXT: .LBB0_38: // %else56
; CHECK-NEXT: tbnz w8, #20, .LBB0_54
; CHECK-NEXT: .LBB0_39: // %else59
; CHECK-NEXT: tbnz w8, #21, .LBB0_55
; CHECK-NEXT: .LBB0_40: // %else62
; CHECK-NEXT: tbnz w8, #22, .LBB0_56
; CHECK-NEXT: .LBB0_41: // %else65
; CHECK-NEXT: tbnz w8, #23, .LBB0_57
; CHECK-NEXT: .LBB0_42: // %else68
; CHECK-NEXT: tbnz w8, #24, .LBB0_58
; CHECK-NEXT: .LBB0_43: // %else71
; CHECK-NEXT: tbnz w8, #25, .LBB0_59
; CHECK-NEXT: .LBB0_44: // %else74
; CHECK-NEXT: tbnz w8, #26, .LBB0_60
; CHECK-NEXT: .LBB0_45: // %else77
; CHECK-NEXT: tbnz w8, #27, .LBB0_61
; CHECK-NEXT: .LBB0_46: // %else80
; CHECK-NEXT: tbnz w8, #28, .LBB0_62
; CHECK-NEXT: .LBB0_47: // %else83
; CHECK-NEXT: tbnz w8, #29, .LBB0_63
; CHECK-NEXT: .LBB0_48: // %else86
; CHECK-NEXT: tbnz w8, #30, .LBB0_64
; CHECK-NEXT: .LBB0_49: // %else89
; CHECK-NEXT: tbz w8, #31, .LBB0_51
; CHECK-NEXT: .LBB0_50: // %cond.load91
; CHECK-NEXT: add x8, x0, #31
; CHECK-NEXT: ld1 { v2.b }[15], [x8]
; CHECK-NEXT: .LBB0_51: // %else92
; CHECK-NEXT: uzp1 v0.16b, v1.16b, v2.16b
; CHECK-NEXT: uzp2 v1.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB0_52: // %cond.load52
; CHECK-NEXT: add x9, x0, #18
; CHECK-NEXT: ld1 { v2.b }[2], [x9]
; CHECK-NEXT: tbz w8, #19, .LBB0_38
; CHECK-NEXT: .LBB0_53: // %cond.load55
; CHECK-NEXT: add x9, x0, #19
; CHECK-NEXT: ld1 { v2.b }[3], [x9]
; CHECK-NEXT: tbz w8, #20, .LBB0_39
; CHECK-NEXT: .LBB0_54: // %cond.load58
; CHECK-NEXT: add x9, x0, #20
; CHECK-NEXT: ld1 { v2.b }[4], [x9]
; CHECK-NEXT: tbz w8, #21, .LBB0_40
; CHECK-NEXT: .LBB0_55: // %cond.load61
; CHECK-NEXT: add x9, x0, #21
; CHECK-NEXT: ld1 { v2.b }[5], [x9]
; CHECK-NEXT: tbz w8, #22, .LBB0_41
; CHECK-NEXT: .LBB0_56: // %cond.load64
; CHECK-NEXT: add x9, x0, #22
; CHECK-NEXT: ld1 { v2.b }[6], [x9]
; CHECK-NEXT: tbz w8, #23, .LBB0_42
; CHECK-NEXT: .LBB0_57: // %cond.load67
; CHECK-NEXT: add x9, x0, #23
; CHECK-NEXT: ld1 { v2.b }[7], [x9]
; CHECK-NEXT: tbz w8, #24, .LBB0_43
; CHECK-NEXT: .LBB0_58: // %cond.load70
; CHECK-NEXT: add x9, x0, #24
; CHECK-NEXT: ld1 { v2.b }[8], [x9]
; CHECK-NEXT: tbz w8, #25, .LBB0_44
; CHECK-NEXT: .LBB0_59: // %cond.load73
; CHECK-NEXT: add x9, x0, #25
; CHECK-NEXT: ld1 { v2.b }[9], [x9]
; CHECK-NEXT: tbz w8, #26, .LBB0_45
; CHECK-NEXT: .LBB0_60: // %cond.load76
; CHECK-NEXT: add x9, x0, #26
; CHECK-NEXT: ld1 { v2.b }[10], [x9]
; CHECK-NEXT: tbz w8, #27, .LBB0_46
; CHECK-NEXT: .LBB0_61: // %cond.load79
; CHECK-NEXT: add x9, x0, #27
; CHECK-NEXT: ld1 { v2.b }[11], [x9]
; CHECK-NEXT: tbz w8, #28, .LBB0_47
; CHECK-NEXT: .LBB0_62: // %cond.load82
; CHECK-NEXT: add x9, x0, #28
; CHECK-NEXT: ld1 { v2.b }[12], [x9]
; CHECK-NEXT: tbz w8, #29, .LBB0_48
; CHECK-NEXT: .LBB0_63: // %cond.load85
; CHECK-NEXT: add x9, x0, #29
; CHECK-NEXT: ld1 { v2.b }[13], [x9]
; CHECK-NEXT: tbz w8, #30, .LBB0_49
; CHECK-NEXT: .LBB0_64: // %cond.load88
; CHECK-NEXT: add x9, x0, #30
; CHECK-NEXT: ld1 { v2.b }[14], [x9]
; CHECK-NEXT: tbnz w8, #31, .LBB0_50
; CHECK-NEXT: b .LBB0_51
%interleaved.mask = call <32 x i1> @llvm.vector.interleave2.v32i1(<16 x i1> %mask, <16 x i1> %mask)
%wide.masked.vec = call <32 x i8> @llvm.masked.load.v32i8.p0(ptr %p, i32 1, <32 x i1> %interleaved.mask, <32 x i8> poison)
%deinterleaved.vec = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %wide.masked.vec)
ret { <16 x i8>, <16 x i8> } %deinterleaved.vec
}
; Fixed-width <8 x i16> case: same pattern as the i8 test (self-interleaved
; mask, <16 x i16> masked load, deinterleave2). The expected output tests each
; of the 16 mask bits and conditionally loads one halfword lane at a time,
; then splits the result with uzp1/uzp2.
define { <8 x i16>, <8 x i16> } @foo_ld2_v8i16(<8 x i1> %mask, ptr %p) {
; CHECK-LABEL: foo_ld2_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: adrp x8, .LCPI1_0
; CHECK-NEXT: zip1 v0.16b, v0.16b, v0.16b
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
; CHECK-NEXT: shl v0.16b, v0.16b, #7
; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: zip1 v0.16b, v0.16b, v1.16b
; CHECK-NEXT: addv h0, v0.8h
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: tbz w8, #0, .LBB1_2
; CHECK-NEXT: // %bb.1: // %cond.load
; CHECK-NEXT: ldr h1, [x0]
; CHECK-NEXT: tbnz w8, #1, .LBB1_3
; CHECK-NEXT: b .LBB1_4
; CHECK-NEXT: .LBB1_2:
; CHECK-NEXT: // implicit-def: $q1
; CHECK-NEXT: tbz w8, #1, .LBB1_4
; CHECK-NEXT: .LBB1_3: // %cond.load1
; CHECK-NEXT: add x9, x0, #2
; CHECK-NEXT: ld1 { v1.h }[1], [x9]
; CHECK-NEXT: .LBB1_4: // %else2
; CHECK-NEXT: tbnz w8, #2, .LBB1_12
; CHECK-NEXT: // %bb.5: // %else5
; CHECK-NEXT: tbnz w8, #3, .LBB1_13
; CHECK-NEXT: .LBB1_6: // %else8
; CHECK-NEXT: tbnz w8, #4, .LBB1_14
; CHECK-NEXT: .LBB1_7: // %else11
; CHECK-NEXT: tbnz w8, #5, .LBB1_15
; CHECK-NEXT: .LBB1_8: // %else14
; CHECK-NEXT: tbnz w8, #6, .LBB1_16
; CHECK-NEXT: .LBB1_9: // %else17
; CHECK-NEXT: tbnz w8, #7, .LBB1_17
; CHECK-NEXT: .LBB1_10: // %else20
; CHECK-NEXT: tbz w8, #8, .LBB1_18
; CHECK-NEXT: .LBB1_11: // %cond.load22
; CHECK-NEXT: add x9, x0, #16
; CHECK-NEXT: ld1 { v2.h }[0], [x9]
; CHECK-NEXT: tbnz w8, #9, .LBB1_19
; CHECK-NEXT: b .LBB1_20
; CHECK-NEXT: .LBB1_12: // %cond.load4
; CHECK-NEXT: add x9, x0, #4
; CHECK-NEXT: ld1 { v1.h }[2], [x9]
; CHECK-NEXT: tbz w8, #3, .LBB1_6
; CHECK-NEXT: .LBB1_13: // %cond.load7
; CHECK-NEXT: add x9, x0, #6
; CHECK-NEXT: ld1 { v1.h }[3], [x9]
; CHECK-NEXT: tbz w8, #4, .LBB1_7
; CHECK-NEXT: .LBB1_14: // %cond.load10
; CHECK-NEXT: add x9, x0, #8
; CHECK-NEXT: ld1 { v1.h }[4], [x9]
; CHECK-NEXT: tbz w8, #5, .LBB1_8
; CHECK-NEXT: .LBB1_15: // %cond.load13
; CHECK-NEXT: add x9, x0, #10
; CHECK-NEXT: ld1 { v1.h }[5], [x9]
; CHECK-NEXT: tbz w8, #6, .LBB1_9
; CHECK-NEXT: .LBB1_16: // %cond.load16
; CHECK-NEXT: add x9, x0, #12
; CHECK-NEXT: ld1 { v1.h }[6], [x9]
; CHECK-NEXT: tbz w8, #7, .LBB1_10
; CHECK-NEXT: .LBB1_17: // %cond.load19
; CHECK-NEXT: add x9, x0, #14
; CHECK-NEXT: ld1 { v1.h }[7], [x9]
; CHECK-NEXT: tbnz w8, #8, .LBB1_11
; CHECK-NEXT: .LBB1_18:
; CHECK-NEXT: // implicit-def: $q2
; CHECK-NEXT: tbz w8, #9, .LBB1_20
; CHECK-NEXT: .LBB1_19: // %cond.load25
; CHECK-NEXT: add x9, x0, #18
; CHECK-NEXT: ld1 { v2.h }[1], [x9]
; CHECK-NEXT: .LBB1_20: // %else26
; CHECK-NEXT: tbnz w8, #10, .LBB1_28
; CHECK-NEXT: // %bb.21: // %else29
; CHECK-NEXT: tbnz w8, #11, .LBB1_29
; CHECK-NEXT: .LBB1_22: // %else32
; CHECK-NEXT: tbnz w8, #12, .LBB1_30
; CHECK-NEXT: .LBB1_23: // %else35
; CHECK-NEXT: tbnz w8, #13, .LBB1_31
; CHECK-NEXT: .LBB1_24: // %else38
; CHECK-NEXT: tbnz w8, #14, .LBB1_32
; CHECK-NEXT: .LBB1_25: // %else41
; CHECK-NEXT: tbz w8, #15, .LBB1_27
; CHECK-NEXT: .LBB1_26: // %cond.load43
; CHECK-NEXT: add x8, x0, #30
; CHECK-NEXT: ld1 { v2.h }[7], [x8]
; CHECK-NEXT: .LBB1_27: // %else44
; CHECK-NEXT: uzp1 v0.8h, v1.8h, v2.8h
; CHECK-NEXT: uzp2 v1.8h, v1.8h, v2.8h
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB1_28: // %cond.load28
; CHECK-NEXT: add x9, x0, #20
; CHECK-NEXT: ld1 { v2.h }[2], [x9]
; CHECK-NEXT: tbz w8, #11, .LBB1_22
; CHECK-NEXT: .LBB1_29: // %cond.load31
; CHECK-NEXT: add x9, x0, #22
; CHECK-NEXT: ld1 { v2.h }[3], [x9]
; CHECK-NEXT: tbz w8, #12, .LBB1_23
; CHECK-NEXT: .LBB1_30: // %cond.load34
; CHECK-NEXT: add x9, x0, #24
; CHECK-NEXT: ld1 { v2.h }[4], [x9]
; CHECK-NEXT: tbz w8, #13, .LBB1_24
; CHECK-NEXT: .LBB1_31: // %cond.load37
; CHECK-NEXT: add x9, x0, #26
; CHECK-NEXT: ld1 { v2.h }[5], [x9]
; CHECK-NEXT: tbz w8, #14, .LBB1_25
; CHECK-NEXT: .LBB1_32: // %cond.load40
; CHECK-NEXT: add x9, x0, #28
; CHECK-NEXT: ld1 { v2.h }[6], [x9]
; CHECK-NEXT: tbnz w8, #15, .LBB1_26
; CHECK-NEXT: b .LBB1_27
%interleaved.mask = call <16 x i1> @llvm.vector.interleave2.v16i1(<8 x i1> %mask, <8 x i1> %mask)
%wide.masked.vec = call <16 x i16> @llvm.masked.load.v16i16.p0(ptr %p, i32 2, <16 x i1> %interleaved.mask, <16 x i16> poison)
%deinterleaved.vec = call { <8 x i16>, <8 x i16> } @llvm.vector.deinterleave2.v16i16(<16 x i16> %wide.masked.vec)
ret { <8 x i16>, <8 x i16> } %deinterleaved.vec
}
; Fixed-width <4 x float> case: the self-interleaved mask guards an <8 x float>
; masked load whose result is passed to deinterleave2. The expected output
; tests each of the 8 mask bits and conditionally loads one word lane at a
; time, then splits the result with uzp1/uzp2.
define { <4 x float>, <4 x float> } @foo_ld2_v4f32(<4 x i1> %mask, ptr %p) {
; CHECK-LABEL: foo_ld2_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: uzp1 v0.8b, v0.8b, v0.8b
; CHECK-NEXT: adrp x8, .LCPI2_0
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI2_0]
; CHECK-NEXT: zip1 v0.8b, v0.8b, v0.8b
; CHECK-NEXT: shl v0.8b, v0.8b, #7
; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-NEXT: addv b0, v0.8b
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: tbz w8, #0, .LBB2_2
; CHECK-NEXT: // %bb.1: // %cond.load
; CHECK-NEXT: ldr s1, [x0]
; CHECK-NEXT: tbnz w8, #1, .LBB2_3
; CHECK-NEXT: b .LBB2_4
; CHECK-NEXT: .LBB2_2:
; CHECK-NEXT: // implicit-def: $q1
; CHECK-NEXT: tbz w8, #1, .LBB2_4
; CHECK-NEXT: .LBB2_3: // %cond.load1
; CHECK-NEXT: add x9, x0, #4
; CHECK-NEXT: ld1 { v1.s }[1], [x9]
; CHECK-NEXT: .LBB2_4: // %else2
; CHECK-NEXT: tbnz w8, #2, .LBB2_8
; CHECK-NEXT: // %bb.5: // %else5
; CHECK-NEXT: tbnz w8, #3, .LBB2_9
; CHECK-NEXT: .LBB2_6: // %else8
; CHECK-NEXT: tbz w8, #4, .LBB2_10
; CHECK-NEXT: .LBB2_7: // %cond.load10
; CHECK-NEXT: add x9, x0, #16
; CHECK-NEXT: ld1 { v2.s }[0], [x9]
; CHECK-NEXT: tbnz w8, #5, .LBB2_11
; CHECK-NEXT: b .LBB2_12
; CHECK-NEXT: .LBB2_8: // %cond.load4
; CHECK-NEXT: add x9, x0, #8
; CHECK-NEXT: ld1 { v1.s }[2], [x9]
; CHECK-NEXT: tbz w8, #3, .LBB2_6
; CHECK-NEXT: .LBB2_9: // %cond.load7
; CHECK-NEXT: add x9, x0, #12
; CHECK-NEXT: ld1 { v1.s }[3], [x9]
; CHECK-NEXT: tbnz w8, #4, .LBB2_7
; CHECK-NEXT: .LBB2_10:
; CHECK-NEXT: // implicit-def: $q2
; CHECK-NEXT: tbz w8, #5, .LBB2_12
; CHECK-NEXT: .LBB2_11: // %cond.load13
; CHECK-NEXT: add x9, x0, #20
; CHECK-NEXT: ld1 { v2.s }[1], [x9]
; CHECK-NEXT: .LBB2_12: // %else14
; CHECK-NEXT: tbnz w8, #6, .LBB2_16
; CHECK-NEXT: // %bb.13: // %else17
; CHECK-NEXT: tbz w8, #7, .LBB2_15
; CHECK-NEXT: .LBB2_14: // %cond.load19
; CHECK-NEXT: add x8, x0, #28
; CHECK-NEXT: ld1 { v2.s }[3], [x8]
; CHECK-NEXT: .LBB2_15: // %else20
; CHECK-NEXT: uzp1 v0.4s, v1.4s, v2.4s
; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB2_16: // %cond.load16
; CHECK-NEXT: add x9, x0, #24
; CHECK-NEXT: ld1 { v2.s }[2], [x9]
; CHECK-NEXT: tbnz w8, #7, .LBB2_14
; CHECK-NEXT: b .LBB2_15
%interleaved.mask = call <8 x i1> @llvm.vector.interleave2.v8i1(<4 x i1> %mask, <4 x i1> %mask)
%wide.masked.vec = call <8 x float> @llvm.masked.load.v8f32.p0(ptr %p, i32 4, <8 x i1> %interleaved.mask, <8 x float> poison)
%deinterleaved.vec = call { <4 x float>, <4 x float> } @llvm.vector.deinterleave2.v8f32(<8 x float> %wide.masked.vec)
ret { <4 x float>, <4 x float> } %deinterleaved.vec
}
; Fixed-width <2 x double> case: the self-interleaved mask guards a
; <4 x double> masked load whose result is passed to deinterleave2. The
; expected output tests each of the 4 mask bits and conditionally loads one
; doubleword lane at a time, then splits the result with zip1/zip2.
define { <2 x double>, <2 x double> } @foo_ld2_v2f64(<2 x i1> %mask, ptr %p) {
; CHECK-LABEL: foo_ld2_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h
; CHECK-NEXT: adrp x8, .LCPI3_0
; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI3_0]
; CHECK-NEXT: zip1 v0.4h, v0.4h, v0.4h
; CHECK-NEXT: shl v0.4h, v0.4h, #15
; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-NEXT: addv h0, v0.4h
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: tbz w8, #0, .LBB3_2
; CHECK-NEXT: // %bb.1: // %cond.load
; CHECK-NEXT: ldr d1, [x0]
; CHECK-NEXT: tbnz w8, #1, .LBB3_3
; CHECK-NEXT: b .LBB3_4
; CHECK-NEXT: .LBB3_2:
; CHECK-NEXT: // implicit-def: $q1
; CHECK-NEXT: tbz w8, #1, .LBB3_4
; CHECK-NEXT: .LBB3_3: // %cond.load1
; CHECK-NEXT: add x9, x0, #8
; CHECK-NEXT: ld1 { v1.d }[1], [x9]
; CHECK-NEXT: .LBB3_4: // %else2
; CHECK-NEXT: tbz w8, #2, .LBB3_6
; CHECK-NEXT: // %bb.5: // %cond.load4
; CHECK-NEXT: add x9, x0, #16
; CHECK-NEXT: ld1 { v2.d }[0], [x9]
; CHECK-NEXT: tbnz w8, #3, .LBB3_7
; CHECK-NEXT: b .LBB3_8
; CHECK-NEXT: .LBB3_6:
; CHECK-NEXT: // implicit-def: $q2
; CHECK-NEXT: tbz w8, #3, .LBB3_8
; CHECK-NEXT: .LBB3_7: // %cond.load7
; CHECK-NEXT: add x8, x0, #24
; CHECK-NEXT: ld1 { v2.d }[1], [x8]
; CHECK-NEXT: .LBB3_8: // %else8
; CHECK-NEXT: zip1 v0.2d, v1.2d, v2.2d
; CHECK-NEXT: zip2 v1.2d, v1.2d, v2.2d
; CHECK-NEXT: ret
%interleaved.mask = call <4 x i1> @llvm.vector.interleave2.v4i1(<2 x i1> %mask, <2 x i1> %mask)
%wide.masked.vec = call <4 x double> @llvm.masked.load.v4f64.p0(ptr %p, i32 8, <4 x i1> %interleaved.mask, <4 x double> poison)
%deinterleaved.vec = call { <2 x double>, <2 x double> } @llvm.vector.deinterleave2.v4f64(<4 x double> %wide.masked.vec)
ret { <2 x double>, <2 x double> } %deinterleaved.vec
}

View File

@ -0,0 +1,287 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
; Positive test, i8 elements, two parts: %mask is interleaved with itself, the
; result masks a <vscale x 32 x i8> load with a poison passthru, and the load
; is deinterleaved. The expected assembly is a single ld2b predicated on the
; original mask register (p0).
define { <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld2_nxv16i8(<vscale x 16 x i1> %mask, ptr %p) {
; CHECK-LABEL: foo_ld2_nxv16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x0]
; CHECK-NEXT: ret
%interleaved.mask = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask)
%wide.masked.vec = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8(ptr %p, i32 1, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i8> poison)
%deinterleaved.vec = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %wide.masked.vec)
ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleaved.vec
}
; Positive test, i16 elements, two parts: same self-interleaved-mask pattern on
; a <vscale x 16 x i16> masked load with a poison passthru. The expected
; assembly is a single ld2h predicated on p0.
define { <vscale x 8 x i16>, <vscale x 8 x i16> } @foo_ld2_nxv8i16(<vscale x 8 x i1> %mask, ptr %p) {
; CHECK-LABEL: foo_ld2_nxv8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld2h { z0.h, z1.h }, p0/z, [x0]
; CHECK-NEXT: ret
%interleaved.mask = call <vscale x 16 x i1> @llvm.vector.interleave2.nxv16i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask)
%wide.masked.vec = call <vscale x 16 x i16> @llvm.masked.load.nxv16i16.p0(ptr %p, i32 2, <vscale x 16 x i1> %interleaved.mask, <vscale x 16 x i16> poison)
%deinterleaved.vec = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %wide.masked.vec)
ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %deinterleaved.vec
}
; Positive test, float elements, two parts: same self-interleaved-mask pattern
; on a <vscale x 8 x float> masked load with a poison passthru. The expected
; assembly is a single ld2w predicated on p0.
define { <vscale x 4 x float>, <vscale x 4 x float> } @foo_ld2_nxv4f32(<vscale x 4 x i1> %mask, ptr %p) {
; CHECK-LABEL: foo_ld2_nxv4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ld2w { z0.s, z1.s }, p0/z, [x0]
; CHECK-NEXT: ret
%interleaved.mask = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask)
%wide.masked.vec = call <vscale x 8 x float> @llvm.masked.load.nxv8f32(ptr %p, i32 4, <vscale x 8 x i1> %interleaved.mask, <vscale x 8 x float> poison)
%deinterleaved.vec = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %wide.masked.vec)
ret { <vscale x 4 x float>, <vscale x 4 x float> } %deinterleaved.vec
}
; Positive test, double elements, two parts: same self-interleaved-mask pattern
; on a <vscale x 4 x double> masked load with a poison passthru. The expected
; assembly is a single ld2d predicated on p0.
define { <vscale x 2 x double>, <vscale x 2 x double> } @foo_ld2_nxv2f64(<vscale x 2 x i1> %mask, ptr %p) {
; CHECK-LABEL: foo_ld2_nxv2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ld2d { z0.d, z1.d }, p0/z, [x0]
; CHECK-NEXT: ret
%interleaved.mask = call <vscale x 4 x i1> @llvm.vector.interleave2.nxv4i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask)
%wide.masked.vec = call <vscale x 4 x double> @llvm.masked.load.nxv4f64(ptr %p, i32 8, <vscale x 4 x i1> %interleaved.mask, <vscale x 4 x double> poison)
%deinterleaved.vec = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide.masked.vec)
ret { <vscale x 2 x double>, <vscale x 2 x double> } %deinterleaved.vec
}
; Positive test, i8 elements, four parts: four copies of %mask are interleaved,
; the result masks a <vscale x 64 x i8> load with a poison passthru, and the
; load is deinterleaved four ways. The expected assembly is a single ld4b
; predicated on p0.
define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld4_nxv16i8(<vscale x 16 x i1> %mask, ptr %p) {
; CHECK-LABEL: foo_ld4_nxv16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ld4b { z0.b - z3.b }, p0/z, [x0]
; CHECK-NEXT: ret
%interleaved.mask = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask)
%wide.masked.vec = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8(ptr %p, i32 1, <vscale x 64 x i1> %interleaved.mask, <vscale x 64 x i8> poison)
%deinterleaved.vec = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %wide.masked.vec)
ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleaved.vec
}
; Positive test, i16 elements, four parts: same four-way self-interleaved-mask
; pattern on a <vscale x 32 x i16> masked load with a poison passthru. The
; expected assembly is a single ld4h predicated on p0.
define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @foo_ld4_nxv8i16(<vscale x 8 x i1> %mask, ptr %p) {
; CHECK-LABEL: foo_ld4_nxv8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld4h { z0.h - z3.h }, p0/z, [x0]
; CHECK-NEXT: ret
%interleaved.mask = call <vscale x 32 x i1> @llvm.vector.interleave4.nxv32i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask)
%wide.masked.vec = call <vscale x 32 x i16> @llvm.masked.load.nxv32i16(ptr %p, i32 2, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i16> poison)
%deinterleaved.vec = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave4.nxv32i16(<vscale x 32 x i16> %wide.masked.vec)
ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %deinterleaved.vec
}
; Positive test, float elements, four parts: same four-way self-interleaved-mask
; pattern on a <vscale x 16 x float> masked load with a poison passthru. The
; expected assembly is a single ld4w predicated on p0.
define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @foo_ld4_nxv4f32(<vscale x 4 x i1> %mask, ptr %p) {
; CHECK-LABEL: foo_ld4_nxv4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ld4w { z0.s - z3.s }, p0/z, [x0]
; CHECK-NEXT: ret
%interleaved.mask = call <vscale x 16 x i1> @llvm.vector.interleave4.nxv16i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask)
%wide.masked.vec = call <vscale x 16 x float> @llvm.masked.load.nxv16f32(ptr %p, i32 4, <vscale x 16 x i1> %interleaved.mask, <vscale x 16 x float> poison)
%deinterleaved.vec = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave4.nxv16f32(<vscale x 16 x float> %wide.masked.vec)
ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %deinterleaved.vec
}
; Positive test, double elements, four parts: same four-way
; self-interleaved-mask pattern on a <vscale x 8 x double> masked load with a
; poison passthru. The expected assembly is a single ld4d predicated on p0.
define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @foo_ld4_nxv2f64(<vscale x 2 x i1> %mask, ptr %p) {
; CHECK-LABEL: foo_ld4_nxv2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ld4d { z0.d - z3.d }, p0/z, [x0]
; CHECK-NEXT: ret
%interleaved.mask = call <vscale x 8 x i1> @llvm.vector.interleave4.nxv8i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask, <vscale x 2 x i1> %mask)
%wide.masked.vec = call <vscale x 8 x double> @llvm.masked.load.nxv8f64(ptr %p, i32 8, <vscale x 8 x i1> %interleaved.mask, <vscale x 8 x double> poison)
%deinterleaved.vec = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave4.nxv8f64(<vscale x 8 x double> %wide.masked.vec)
ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %deinterleaved.vec
}
; The interleaved mask has a second user (llvm.fake.use) besides the masked
; load. The expected assembly still contains a single ld4b predicated on the
; original mask (p0); the zip1/zip2 sequence only materialises the interleaved
; mask in p0-p3 for the fake use.
define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld4_nxv16i8_mul_use_of_mask(<vscale x 16 x i1> %mask, ptr %p, ptr %p2) {
; CHECK-LABEL: foo_ld4_nxv16i8_mul_use_of_mask:
; CHECK: // %bb.0:
; CHECK-NEXT: zip1 p2.b, p0.b, p0.b
; CHECK-NEXT: ld4b { z0.b - z3.b }, p0/z, [x0]
; CHECK-NEXT: zip2 p1.b, p0.b, p0.b
; CHECK-NEXT: zip1 p3.b, p2.b, p2.b
; CHECK-NEXT: zip2 p0.b, p1.b, p1.b
; CHECK-NEXT: zip1 p1.b, p1.b, p1.b
; CHECK-NEXT: zip2 p2.b, p2.b, p2.b
; CHECK-NEXT: // fake_use: $p3
; CHECK-NEXT: // fake_use: $p2
; CHECK-NEXT: // fake_use: $p1
; CHECK-NEXT: // fake_use: $p0
; CHECK-NEXT: ret
%interleaved.mask = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask)
%wide.masked.vec = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8(ptr %p, i32 4, <vscale x 64 x i1> %interleaved.mask, <vscale x 64 x i8> poison)
%deinterleaved.vec = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %wide.masked.vec)
call void (...) @llvm.fake.use(<vscale x 64 x i1> %interleaved.mask)
ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleaved.vec
}
; The mask is an interleave4 of four all-true splats rather than of a function
; argument. The expected assembly is a ptrue that builds the predicate,
; followed by a single ld4b.
define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld4_nxv16i8_mask_of_interleaved_ones(ptr %p) {
; CHECK-LABEL: foo_ld4_nxv16i8_mask_of_interleaved_ones:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: ld4b { z0.b - z3.b }, p0/z, [x0]
; CHECK-NEXT: ret
%interleaved.mask = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> splat(i1 1), <vscale x 16 x i1> splat(i1 1), <vscale x 16 x i1> splat(i1 1), <vscale x 16 x i1> splat(i1 1))
%wide.masked.vec = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8(ptr %p, i32 4, <vscale x 64 x i1> %interleaved.mask, <vscale x 64 x i8> poison)
%deinterleaved.vec = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %wide.masked.vec)
ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleaved.vec
}
; The mask is a wide all-true splat passed straight to the masked load, with no
; interleave intrinsic involved. The expected assembly is the same ptrue plus a
; single ld4b.
define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld4_nxv16i8_mask_of_ones(ptr %p) {
; CHECK-LABEL: foo_ld4_nxv16i8_mask_of_ones:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: ld4b { z0.b - z3.b }, p0/z, [x0]
; CHECK-NEXT: ret
%wide.masked.vec = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8(ptr %p, i32 4, <vscale x 64 x i1> splat(i1 1), <vscale x 64 x i8> poison)
%deinterleaved.vec = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %wide.masked.vec)
ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleaved.vec
}
; Negative tests
; The wide masked load has a second user (llvm.fake.use) besides the
; deinterleave. The expected assembly contains no ld2b: the mask is zipped, two
; ld1b instructions load the wide value, and uzp1/uzp2 split it.
define { <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld2_nxv16i8_mul_use_of_load(<vscale x 16 x i1> %mask, ptr %p, ptr %p2) {
; CHECK-LABEL: foo_ld2_nxv16i8_mul_use_of_load:
; CHECK: // %bb.0:
; CHECK-NEXT: zip1 p1.b, p0.b, p0.b
; CHECK-NEXT: zip2 p0.b, p0.b, p0.b
; CHECK-NEXT: ld1b { z3.b }, p1/z, [x0]
; CHECK-NEXT: ld1b { z2.b }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: uzp1 z0.b, z3.b, z2.b
; CHECK-NEXT: uzp2 z1.b, z3.b, z2.b
; CHECK-NEXT: // fake_use: $z3
; CHECK-NEXT: // fake_use: $z2
; CHECK-NEXT: ret
%interleaved.mask = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask)
%wide.masked.vec = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8(ptr %p, i32 4, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i8> poison)
%deinterleaved.vec = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %wide.masked.vec)
call void (...) @llvm.fake.use(<vscale x 32 x i8> %wide.masked.vec)
ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleaved.vec
}
; Mask must be an interleave of identical masks.
; Here the interleave2 operands are two different arguments (%mask and %mask2),
; and the expected assembly contains two ld1b instructions plus uzp1/uzp2
; rather than ld2b.
define { <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld2_nxv16i8_bad_mask(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask2, ptr %p, ptr %p2) {
; CHECK-LABEL: foo_ld2_nxv16i8_bad_mask:
; CHECK: // %bb.0:
; CHECK-NEXT: zip1 p2.b, p0.b, p1.b
; CHECK-NEXT: zip2 p0.b, p0.b, p1.b
; CHECK-NEXT: ld1b { z2.b }, p2/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: uzp1 z0.b, z2.b, z1.b
; CHECK-NEXT: uzp2 z1.b, z2.b, z1.b
; CHECK-NEXT: ret
%interleaved.mask = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask2)
%wide.masked.vec = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8(ptr %p, i32 4, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i8> poison)
%deinterleaved.vec = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %wide.masked.vec)
ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleaved.vec
}
; Number of parts in mask interleave must match deinterleave.
; Here the mask is a two-way interleave of a <vscale x 32 x i1> value while the
; load is deinterleaved four ways. The expected assembly contains four ld1b
; instructions and a uzp1/uzp2 tree rather than ld4b.
define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld4_nxv16i8_bad_mask2(<vscale x 32 x i1> %mask, ptr %p, ptr %p2) {
; CHECK-LABEL: foo_ld4_nxv16i8_bad_mask2:
; CHECK: // %bb.0:
; CHECK-NEXT: zip1 p2.b, p1.b, p1.b
; CHECK-NEXT: zip2 p1.b, p1.b, p1.b
; CHECK-NEXT: zip2 p3.b, p0.b, p0.b
; CHECK-NEXT: ld1b { z3.b }, p2/z, [x0, #2, mul vl]
; CHECK-NEXT: zip1 p0.b, p0.b, p0.b
; CHECK-NEXT: ld1b { z2.b }, p1/z, [x0, #3, mul vl]
; CHECK-NEXT: ld1b { z0.b }, p3/z, [x0, #1, mul vl]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0]
; CHECK-NEXT: uzp1 z4.b, z3.b, z2.b
; CHECK-NEXT: uzp2 z3.b, z3.b, z2.b
; CHECK-NEXT: uzp1 z5.b, z1.b, z0.b
; CHECK-NEXT: uzp2 z6.b, z1.b, z0.b
; CHECK-NEXT: uzp1 z0.b, z5.b, z4.b
; CHECK-NEXT: uzp1 z1.b, z6.b, z3.b
; CHECK-NEXT: uzp2 z2.b, z5.b, z4.b
; CHECK-NEXT: uzp2 z3.b, z6.b, z3.b
; CHECK-NEXT: ret
%interleaved.mask = call <vscale x 64 x i1> @llvm.vector.interleave2.nxv64i1(<vscale x 32 x i1> %mask, <vscale x 32 x i1> %mask)
%wide.masked.vec = call <vscale x 64 x i8> @llvm.masked.load.nxv64i8(ptr %p, i32 4, <vscale x 64 x i1> %interleaved.mask, <vscale x 64 x i8> poison)
%deinterleaved.vec = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %wide.masked.vec)
ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleaved.vec
}
; Mask must come from an interleave or a splat.
; Here the wide <vscale x 32 x i1> mask is a plain function argument. The
; expected assembly uses its two halves (p0, p1) for two ld1b instructions,
; followed by uzp1/uzp2, rather than ld2b.
define { <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld2_nxv16i8_bad_mask3(<vscale x 32 x i1> %mask, ptr %p, ptr %p2) {
; CHECK-LABEL: foo_ld2_nxv16i8_bad_mask3:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0, #1, mul vl]
; CHECK-NEXT: ld1b { z2.b }, p0/z, [x0]
; CHECK-NEXT: uzp1 z0.b, z2.b, z1.b
; CHECK-NEXT: uzp2 z1.b, z2.b, z1.b
; CHECK-NEXT: ret
%wide.masked.vec = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8(ptr %p, i32 4, <vscale x 32 x i1> %mask, <vscale x 32 x i8> poison)
%deinterleaved.vec = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %wide.masked.vec)
ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleaved.vec
}
; Each deinterleaved vector must be exactly 128 bits.
; Here each part is <vscale x 8 x i8>, i.e. 8 x 8 = 64 bits per vscale unit.
; The expected assembly is a single ld1b followed by unpacks and uzp1/uzp2
; rather than ld2b.
define { <vscale x 8 x i8>, <vscale x 8 x i8> } @foo_ld2_nxv8i8(<vscale x 8 x i1> %mask, ptr %p) {
; CHECK-LABEL: foo_ld2_nxv8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: zip2 p1.h, p0.h, p0.h
; CHECK-NEXT: zip1 p0.h, p0.h, p0.h
; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: uunpkhi z1.h, z0.b
; CHECK-NEXT: uunpklo z2.h, z0.b
; CHECK-NEXT: uzp1 z0.h, z2.h, z1.h
; CHECK-NEXT: uzp2 z1.h, z2.h, z1.h
; CHECK-NEXT: ret
%interleaved.mask = call <vscale x 16 x i1> @llvm.vector.interleave2.nxv16i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %mask)
%wide.masked.vec = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8(ptr %p, i32 1, <vscale x 16 x i1> %interleaved.mask, <vscale x 16 x i8> poison)
%deinterleaved.vec = call { <vscale x 8 x i8>, <vscale x 8 x i8> } @llvm.vector.deinterleave2.nxv16i8(<vscale x 16 x i8> %wide.masked.vec)
ret { <vscale x 8 x i8>, <vscale x 8 x i8> } %deinterleaved.vec
}
; Passthru must be poison or zero.
; Here the passthru is splat(i8 3). The expected assembly materialises the
; constant, performs two ld1b instructions, merges in the passthru with sel,
; and splits the result with uzp1/uzp2 rather than using ld2b.
define { <vscale x 16 x i8>, <vscale x 16 x i8> } @foo_ld2_nxv16i8_bad_passthru(<vscale x 16 x i1> %mask, ptr %p) {
; CHECK-LABEL: foo_ld2_nxv16i8_bad_passthru:
; CHECK: // %bb.0:
; CHECK-NEXT: zip1 p1.b, p0.b, p0.b
; CHECK-NEXT: mov z0.b, #3 // =0x3
; CHECK-NEXT: zip2 p0.b, p0.b, p0.b
; CHECK-NEXT: ld1b { z2.b }, p1/z, [x0]
; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT: sel z2.b, p1, z2.b, z0.b
; CHECK-NEXT: sel z1.b, p0, z1.b, z0.b
; CHECK-NEXT: uzp1 z0.b, z2.b, z1.b
; CHECK-NEXT: uzp2 z1.b, z2.b, z1.b
; CHECK-NEXT: ret
%interleaved.mask = call <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1> %mask, <vscale x 16 x i1> %mask)
%wide.masked.vec = call <vscale x 32 x i8> @llvm.masked.load.nxv32i8(ptr %p, i32 1, <vscale x 32 x i1> %interleaved.mask, <vscale x 32 x i8> splat(i8 3))
%deinterleaved.vec = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %wide.masked.vec)
ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleaved.vec
}
; The deinterleave operand is not a load at all: it is built from the two
; vector arguments with llvm.vector.insert. The expected assembly is just
; uzp1/uzp2 on the incoming registers.
define { <vscale x 8 x i16>, <vscale x 8 x i16> } @foo_deinterleave2_not_load(<vscale x 8 x i16> %vec1, <vscale x 8 x i16> %vec2) {
; CHECK-LABEL: foo_deinterleave2_not_load:
; CHECK: // %bb.0:
; CHECK-NEXT: uzp1 z2.h, z0.h, z1.h
; CHECK-NEXT: uzp2 z1.h, z0.h, z1.h
; CHECK-NEXT: mov z0.d, z2.d
; CHECK-NEXT: ret
%bad.vec.init = call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> %vec1, i64 0)
%bad.vec = call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16(<vscale x 16 x i16> %bad.vec.init, <vscale x 8 x i16> %vec2, i64 8)
%deinterleaved.vec = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %bad.vec)
ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %deinterleaved.vec
}
; The masked load result is zero-extended from i8 to i16 before the
; deinterleave, so the deinterleave operand is the zext rather than the load.
; The expected assembly is a single ld1b into halfword elements, followed by
; unpacks and uzp1/uzp2, rather than an ld2 instruction.
define { <vscale x 4 x i16>, <vscale x 4 x i16> } @foo_ld2_nxv8i8_exti16(<vscale x 4 x i1> %mask, ptr %p) {
; CHECK-LABEL: foo_ld2_nxv8i8_exti16:
; CHECK: // %bb.0:
; CHECK-NEXT: zip2 p1.s, p0.s, p0.s
; CHECK-NEXT: zip1 p0.s, p0.s, p0.s
; CHECK-NEXT: uzp1 p0.h, p0.h, p1.h
; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
; CHECK-NEXT: uunpkhi z1.s, z0.h
; CHECK-NEXT: uunpklo z2.s, z0.h
; CHECK-NEXT: uzp1 z0.s, z2.s, z1.s
; CHECK-NEXT: uzp2 z1.s, z2.s, z1.s
; CHECK-NEXT: ret
%interleaved.mask = call <vscale x 8 x i1> @llvm.vector.interleave2.nxv8i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %mask)
%wide.masked.vec = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %p, i32 1, <vscale x 8 x i1> %interleaved.mask, <vscale x 8 x i8> poison)
%wide.masked.vec.ext = zext <vscale x 8 x i8> %wide.masked.vec to <vscale x 8 x i16>
%deinterleaved.vec = call { <vscale x 4 x i16>, <vscale x 4 x i16> } @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16> %wide.masked.vec.ext)
ret { <vscale x 4 x i16>, <vscale x 4 x i16> } %deinterleaved.vec
}