From 8b3a124ad87d1e808852644090ea5d1117fe2f9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 6 Nov 2025 11:00:45 +0200 Subject: [PATCH] Revert "[InterleavedAccess] Construct interleaved access store with shuffles" This reverts commit 78d649199b47370b72848c1ca8d9bd3323b050ac. That commit caused failed asserts, see https://github.com/llvm/llvm-project/pull/164000 for details. --- llvm/include/llvm/CodeGen/TargetLowering.h | 5 - llvm/lib/CodeGen/InterleavedAccessPass.cpp | 13 +- .../Target/AArch64/AArch64ISelLowering.cpp | 131 +----------------- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 7 - .../AArch64/AArch64TargetTransformInfo.cpp | 38 +---- llvm/test/CodeGen/AArch64/vldn_shuffle.ll | 105 -------------- .../AArch64/interleaved_store.ll | 117 ---------------- .../AArch64/replicating-load-store-costs.ll | 2 +- .../PhaseOrdering/AArch64/interleave_vec.ll | 16 +-- 9 files changed, 18 insertions(+), 416 deletions(-) delete mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/interleaved_store.ll diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 8aeaa9cdacfc..2550c2bee5f7 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3233,11 +3233,6 @@ public: /// Default to be the minimum interleave factor: 2. virtual unsigned getMaxSupportedInterleaveFactor() const { return 2; } - /// Return true if the target interleave with shuffles are cheaper - virtual bool isProfitableToInterleaveWithGatherScatter() const { - return false; - } - /// Lower an interleaved load to target specific intrinsics. Return /// true on success. /// diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 45eca28ffb8a..5c27a20869f8 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -239,8 +239,7 @@ static bool isDeInterleaveMask(ArrayRef Mask, unsigned &Factor, /// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...> /// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7> static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor, - unsigned MaxFactor, - bool InterleaveWithShuffles) { + unsigned MaxFactor) { unsigned NumElts = SVI->getShuffleMask().size(); if (NumElts < 4) return false; @@ -251,13 +250,6 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor, return true; } - if (InterleaveWithShuffles) { - for (unsigned i = 1; MaxFactor * i <= 16; i *= 2) { - Factor = i * MaxFactor; - if (SVI->isInterleave(Factor)) - return true; - } - } return false; } @@ -536,8 +528,7 @@ bool InterleavedAccessImpl::lowerInterleavedStore( cast(SVI->getType())->getNumElements(); // Check if the shufflevector is RE-interleave shuffle. unsigned Factor; - if (!isReInterleaveMask(SVI, Factor, MaxFactor, - TLI->isProfitableToInterleaveWithGatherScatter())) + if (!isReInterleaveMask(SVI, Factor, MaxFactor)) return false; assert(NumStoredElements % Factor == 0 && "number of stored element should be a multiple of Factor"); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 298746863d22..d08f9b94227a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -96,7 +96,6 @@ #include #include #include -#include #include #include #include @@ -17990,17 +17989,11 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store, unsigned Factor, const APInt &GapMask) const { + assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && + "Invalid interleave factor"); auto *SI = dyn_cast(Store); if (!SI) return false; - - if (isProfitableToInterleaveWithGatherScatter() && - Factor > getMaxSupportedInterleaveFactor()) - return lowerInterleavedStoreWithShuffle(SI, SVI, Factor); - - assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && - "Invalid interleave factor"); - assert(!LaneMask && GapMask.popcount() == Factor && "Unexpected mask on store"); @@ -18146,126 +18139,6 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store, return true; } -/// If the interleaved vector elements are greater than supported MaxFactor, -/// interleaving the data with additional shuffles can be used to -/// achieve the same. -/// -/// Consider the following data with 8 interleaves which are shuffled to store -/// stN instructions. Data needs to be stored in this order: -/// [v0, v1, v2, v3, v4, v5, v6, v7] -/// -/// v0 v4 v2 v6 v1 v5 v3 v7 -/// | | | | | | | | -/// \ / \ / \ / \ / -/// [zip v0,v4] [zip v2,v6] [zip v1,v5] [zip v3,v7] ==> stN = 4 -/// | | | | -/// \ / \ / -/// \ / \ / -/// \ / \ / -/// [zip [v0,v2,v4,v6]] [zip [v1,v3,v5,v7]] ==> stN = 2 -/// -/// For stN = 4, upper half of interleaved data V0, V1, V2, V3 is stored -/// with one st4 instruction. Lower half, i.e, V4, V5, V6, V7 is stored with -/// another st4. -/// -/// For stN = 2, upper half of interleaved data V0, V1 is stored -/// with one st2 instruction. Second set V2, V3 is stored with another st2. -/// Total of 4 st2's are required here. -bool AArch64TargetLowering::lowerInterleavedStoreWithShuffle( - StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { - unsigned MaxSupportedFactor = getMaxSupportedInterleaveFactor(); - - auto *VecTy = cast(SVI->getType()); - assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); - - unsigned LaneLen = VecTy->getNumElements() / Factor; - Type *EltTy = VecTy->getElementType(); - auto *SubVecTy = FixedVectorType::get(EltTy, Factor); - - const DataLayout &DL = SI->getModule()->getDataLayout(); - bool UseScalable; - - // Skip if we do not have NEON and skip illegal vector types. We can - // "legalize" wide vector types into multiple interleaved accesses as long as - // the vector types are divisible by 128. - if (!Subtarget->hasNEON() || - !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) - return false; - - if (UseScalable) - return false; - - std::deque Shuffles; - Shuffles.push_back(SVI); - unsigned ConcatLevel = Factor; - // Getting all the interleaved operands. - while (ConcatLevel > 1) { - unsigned InterleavedOperands = Shuffles.size(); - for (unsigned i = 0; i < InterleavedOperands; i++) { - ShuffleVectorInst *SFL = dyn_cast(Shuffles.front()); - if (!SFL) - return false; - Shuffles.pop_front(); - - Value *Op0 = SFL->getOperand(0); - Value *Op1 = SFL->getOperand(1); - - Shuffles.push_back(dyn_cast(Op0)); - Shuffles.push_back(dyn_cast(Op1)); - } - ConcatLevel >>= 1; - } - - IRBuilder<> Builder(SI); - auto Mask = createInterleaveMask(LaneLen, 2); - SmallVector UpperHalfMask(LaneLen), LowerHalfMask(LaneLen); - for (unsigned i = 0; i < LaneLen; i++) { - LowerHalfMask[i] = Mask[i]; - UpperHalfMask[i] = Mask[i + LaneLen]; - } - - unsigned InterleaveFactor = Factor >> 1; - while (InterleaveFactor >= MaxSupportedFactor) { - std::deque ShufflesIntermediate; - ShufflesIntermediate.resize(Factor); - for (unsigned j = 0; j < Factor; j += (InterleaveFactor * 2)) { - for (unsigned i = 0; i < InterleaveFactor; i++) { - auto *Shuffle = Builder.CreateShuffleVector( - Shuffles[i + j], Shuffles[i + j + InterleaveFactor], LowerHalfMask); - ShufflesIntermediate[i + j] = Shuffle; - Shuffle = Builder.CreateShuffleVector( - Shuffles[i + j], Shuffles[i + j + InterleaveFactor], UpperHalfMask); - ShufflesIntermediate[i + j + InterleaveFactor] = Shuffle; - } - } - Shuffles = ShufflesIntermediate; - InterleaveFactor >>= 1; - } - - Type *PtrTy = SI->getPointerOperandType(); - auto *STVTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen); - - Value *BaseAddr = SI->getPointerOperand(); - Function *StNFunc = getStructuredStoreFunction( - SI->getModule(), MaxSupportedFactor, UseScalable, STVTy, PtrTy); - for (unsigned i = 0; i < (Factor / MaxSupportedFactor); i++) { - SmallVector Ops; - for (unsigned j = 0; j < MaxSupportedFactor; j++) - Ops.push_back(Shuffles[i * MaxSupportedFactor + j]); - - if (i > 0) { - // We will compute the pointer operand of each store from the original - // base address using GEPs. Cast the base address to a pointer to the - // scalar element type. - BaseAddr = Builder.CreateConstGEP1_32( - SubVecTy->getElementType(), BaseAddr, LaneLen * MaxSupportedFactor); - } - Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy)); - Builder.CreateCall(StNFunc, Ops); - } - return true; -} - bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( Instruction *Load, Value *Mask, IntrinsicInst *DI) const { const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index bfd8474bfeec..70bfae717fb7 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -229,10 +229,6 @@ public: bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override; - bool isProfitableToInterleaveWithGatherScatter() const override { - return true; - } - unsigned getMaxSupportedInterleaveFactor() const override { return 4; } bool lowerInterleavedLoad(Instruction *Load, Value *Mask, @@ -243,9 +239,6 @@ public: ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override; - bool lowerInterleavedStoreWithShuffle(StoreInst *SI, ShuffleVectorInst *SVI, - unsigned Factor) const; - bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, IntrinsicInst *DI) const override; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 8729ed389013..197aae6e03cb 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -4922,36 +4922,11 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps)) return InstructionCost::getInvalid(); - unsigned NumLoadStores = 1; - InstructionCost ShuffleCost = 0; - bool isInterleaveWithShuffle = false; - unsigned MaxSupportedFactor = TLI->getMaxSupportedInterleaveFactor(); - - auto *SubVecTy = - VectorType::get(VecVTy->getElementType(), - VecVTy->getElementCount().divideCoefficientBy(Factor)); - - if (TLI->isProfitableToInterleaveWithGatherScatter() && - Opcode == Instruction::Store && (0 == Factor % MaxSupportedFactor) && - Factor > MaxSupportedFactor) { - isInterleaveWithShuffle = true; - SmallVector Mask; - // preparing interleave Mask. - for (unsigned i = 0; i < VecVTy->getElementCount().getKnownMinValue() / 2; - i++) { - for (unsigned j = 0; j < 2; j++) - Mask.push_back(j * Factor + i); - } - - NumLoadStores = Factor / MaxSupportedFactor; - ShuffleCost = - (Factor * getShuffleCost(TargetTransformInfo::SK_Splice, VecVTy, VecVTy, - Mask, CostKind, 0, SubVecTy)); - } - - if (!UseMaskForGaps && - (Factor <= MaxSupportedFactor || isInterleaveWithShuffle)) { + if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) { unsigned MinElts = VecVTy->getElementCount().getKnownMinValue(); + auto *SubVecTy = + VectorType::get(VecVTy->getElementType(), + VecVTy->getElementCount().divideCoefficientBy(Factor)); // ldN/stN only support legal vector types of size 64 or 128 in bits. // Accesses having vector types that are a multiple of 128 bits can be @@ -4959,10 +4934,7 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( bool UseScalable; if (MinElts % Factor == 0 && TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) - return (Factor * - TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable) * - NumLoadStores) + - ShuffleCost; + return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable); } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, diff --git a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll index b2635d3d9f1a..3685e9cf85bd 100644 --- a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll +++ b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll @@ -730,111 +730,6 @@ entry: ret void } -define void @store_factor8(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3, - <4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7) { -; CHECK-LABEL: store_factor8: -; CHECK: .Lfunc_begin17: -; CHECK-NEXT: .cfi_startproc -; CHECK-NEXT: // %bb.0: -; CHECK: zip1 [[V1:.*s]], [[I1:.*s]], [[I5:.*s]] -; CHECK-NEXT: zip2 [[V5:.*s]], [[I1]], [[I5]] -; CHECK-NEXT: zip1 [[V2:.*s]], [[I2:.*s]], [[I6:.*s]] -; CHECK-NEXT: zip2 [[V6:.*s]], [[I2]], [[I6]] -; CHECK-NEXT: zip1 [[V3:.*s]], [[I3:.*s]], [[I7:.*s]] -; CHECK-NEXT: zip2 [[V7:.*s]], [[I3]], [[I7]] -; CHECK-NEXT: zip1 [[V4:.*s]], [[I4:.*s]], [[I8:.*s]] -; CHECK-NEXT: zip2 [[V8:.*s]], [[I4]], [[I8]] -; CHECK-NEXT: st4 { [[V1]], [[V2]], [[V3]], [[V4]] }, [x0], #64 -; CHECK-NEXT: st4 { [[V5]], [[V6]], [[V7]], [[V8]] }, [x0] -; CHECK-NEXT: ret - - %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> - %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> - %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> - %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> - - %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> - %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> - - %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> - store <32 x i32> %interleaved.vec, ptr %ptr, align 4 - ret void -} - -define void @store_factor16(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3, - <4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7, - <4 x i32> %a8, <4 x i32> %a9, <4 x i32> %a10, <4 x i32> %a11, - <4 x i32> %a12, <4 x i32> %a13, <4 x i32> %a14, <4 x i32> %a15) { -; CHECK-LABEL: store_factor16: -; CHECK: .Lfunc_begin18: -; CHECK-NEXT: .cfi_startproc -; CHECK-NEXT: // %bb.0: -; CHECK: zip1 [[V05:.*s]], [[I05:.*s]], [[I13:.*s]] -; CHECK-NEXT: zip1 [[V01:.*s]], [[I01:.*s]], [[I09:.*s]] -; CHECK-NEXT: zip1 [[V02:.*s]], [[I02:.*s]], [[I10:.*s]] -; CHECK-NEXT: zip1 [[V06:.*s]], [[I06:.*s]], [[I14:.*s]] -; CHECK-NEXT: zip1 [[V07:.*s]], [[I07:.*s]], [[I15:.*s]] -; CHECK-NEXT: zip2 [[V09:.*s]], [[I01]], [[I09]] -; CHECK-NEXT: zip2 [[V13:.*s]], [[I05]], [[I13]] -; CHECK-NEXT: zip1 [[V03:.*s]], [[I03:.*s]], [[I11:.*s]] -; CHECK-NEXT: zip1 [[V04:.*s]], [[I04:.*s]], [[I12:.*s]] -; CHECK-NEXT: zip1 [[V08:.*s]], [[I08:.*s]], [[I16:.*s]] -; CHECK-NEXT: zip2 [[V10:.*s]], [[I02]], [[I10]] -; CHECK-NEXT: zip2 [[V14:.*s]], [[I06]], [[I14]] -; CHECK-NEXT: zip2 [[V11:.*s]], [[I03]], [[I11]] -; CHECK-NEXT: zip1 [[V17:.*s]], [[V01]], [[V05]] -; CHECK-NEXT: zip2 [[V15:.*s]], [[I07]], [[I15]] -; CHECK-NEXT: zip2 [[V21:.*s]], [[V01]], [[V05]] -; CHECK-NEXT: zip1 [[V18:.*s]], [[V02]], [[V06]] -; CHECK-NEXT: zip2 [[V12:.*s]], [[I04]], [[I12]] -; CHECK-NEXT: zip2 [[V16:.*s]], [[I08]], [[I16]] -; CHECK-NEXT: zip1 [[V19:.*s]], [[V03]], [[V07]] -; CHECK-NEXT: zip2 [[V22:.*s]], [[V02]], [[V06]] -; CHECK-NEXT: zip1 [[V25:.*s]], [[V09]], [[V13]] -; CHECK-NEXT: zip1 [[V20:.*s]], [[V04]], [[V08]] -; CHECK-NEXT: zip2 [[V23:.*s]], [[V03]], [[V07]] -; CHECK-NEXT: zip1 [[V26:.*s]], [[V10]], [[V14]] -; CHECK-NEXT: zip2 [[V29:.*s]], [[V09]], [[V13]] -; CHECK-NEXT: zip2 [[V24:.*s]], [[V04]], [[V08]] -; CHECK-NEXT: zip1 [[V27:.*s]], [[V11]], [[V15]] -; CHECK-NEXT: zip2 [[V30:.*s]], [[V10]], [[V14]] -; CHECK-NEXT: zip1 [[V28:.*s]], [[V12]], [[V16]] -; CHECK-NEXT: zip2 [[V31:.*s]], [[V11]], [[V15]] -; CHECK-NEXT: zip2 [[V32:.*s]], [[V12]], [[V16]] -; CHECK-NEXT: st4 { [[V17]], [[V18]], [[V19]], [[V20]] }, [x8], #64 -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: st4 { [[V21]], [[V22]], [[V23]], [[V24]] }, [x8] -; CHECK-NEXT: add x8, x0, #128 -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st4 { [[V25]], [[V26]], [[V27]], [[V28]] }, [x8] -; CHECK-NEXT: add x8, x0, #192 -; CHECK-NEXT: st4 { [[V29]], [[V30]], [[V31]], [[V32]] }, [x8] -; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload -; CHECK-NEXT: ret - - %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> - %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> - %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> - %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> - %v4 = shufflevector <4 x i32> %a8, <4 x i32> %a9, <8 x i32> - %v5 = shufflevector <4 x i32> %a10, <4 x i32> %a11, <8 x i32> - %v6 = shufflevector <4 x i32> %a12, <4 x i32> %a13, <8 x i32> - %v7 = shufflevector <4 x i32> %a14, <4 x i32> %a15, <8 x i32> - - %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> - %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> - %s2 = shufflevector <8 x i32> %v4, <8 x i32> %v5, <16 x i32> - %s3 = shufflevector <8 x i32> %v6, <8 x i32> %v7, <16 x i32> - - %d0 = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> - %d1 = shufflevector <16 x i32> %s2, <16 x i32> %s3, <32 x i32> - - %interleaved.vec = shufflevector <32 x i32> %d0, <32 x i32> %d1, <64 x i32> - store <64 x i32> %interleaved.vec, ptr %ptr, align 4 - ret void -} - declare void @llvm.dbg.value(metadata, metadata, metadata) !llvm.dbg.cu = !{!0} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_store.ll deleted file mode 100644 index bd5f4e2a3279..000000000000 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_store.ll +++ /dev/null @@ -1,117 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 -; RUN: opt -passes=loop-vectorize -enable-interleaved-mem-accesses=true -max-interleave-group-factor=16 -S < %s | FileCheck %s - -define dso_local void @_Z6unpackPhS_(ptr noalias noundef readonly captures(none) %in, ptr noalias noundef writeonly captures(none) %out) { -; CHECK-LABEL: define dso_local void @_Z6unpackPhS_( -; CHECK-SAME: ptr noalias noundef readonly captures(none) [[IN:%.*]], ptr noalias noundef writeonly captures(none) [[OUT:%.*]]) { -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[OUT]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[OFFSET_IDX2:%.*]] = mul i64 [[INDEX]], 4 -; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[IN]], i64 [[OFFSET_IDX2]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i8>, ptr [[NEXT_GEP3]], align 1, !alias.scope [[META0:![0-9]+]] -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> -; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> -; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> -; CHECK-NEXT: [[TMP0:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[STRIDED_VEC5]] -; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[STRIDED_VEC4]] -; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i8> [[STRIDED_VEC5]], [[STRIDED_VEC4]] -; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[STRIDED_VEC]] -; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i8> [[STRIDED_VEC5]], [[STRIDED_VEC]] -; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i8> [[STRIDED_VEC4]], [[STRIDED_VEC]] -; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i8> [[STRIDED_VEC5]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> zeroinitializer, <4 x i8> [[STRIDED_VEC6]], <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[STRIDED_VEC5]], <4 x i8> [[TMP0]], <8 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[STRIDED_VEC4]], <4 x i8> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP3]], <8 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[STRIDED_VEC]], <4 x i8> [[TMP4]], <8 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> [[TMP6]], <8 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> [[TMP8]], <8 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> [[TMP10]], <8 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x i8> [[TMP11]], <8 x i8> [[TMP12]], <16 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i8> [[TMP13]], <8 x i8> [[TMP14]], <16 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x i8> [[TMP15]], <8 x i8> [[TMP16]], <16 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x i8> [[TMP17]], <8 x i8> [[TMP18]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <16 x i8> [[TMP19]], <16 x i8> [[TMP20]], <32 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP21]], <16 x i8> [[TMP22]], <32 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <32 x i8> [[TMP23]], <32 x i8> [[TMP24]], <64 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <64 x i8> [[TMP25]], <64 x i8> poison, <64 x i32> -; CHECK-NEXT: store <64 x i8> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 -; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %vector.body, !llvm.loop [[LOOP5:![0-9]+]] -; -entry: - br label %for.body - -for.cond.cleanup: ; preds = %for.body - ret void - -for.body: ; preds = %entry, %for.body - %i.033 = phi i32 [ 0, %entry ], [ %inc17, %for.body ] - %out.addr.032 = phi ptr [ %out, %entry ], [ %add.ptr, %for.body ] - %in.addr.031 = phi ptr [ %in, %entry ], [ %add.ptr15, %for.body ] - store i8 0, ptr %out.addr.032, align 1 - %arrayidx10 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 3 - %0 = load i8, ptr %arrayidx10, align 1 - %arrayidx14 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 1 - store i8 %0, ptr %arrayidx14, align 1 - %arrayidx10.1 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 2 - %1 = load i8, ptr %arrayidx10.1, align 1 - %arrayidx14.1 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 2 - store i8 %1, ptr %arrayidx14.1, align 1 - %add.2 = add i8 %0, %1 - %arrayidx14.2 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 3 - store i8 %add.2, ptr %arrayidx14.2, align 1 - %arrayidx10.3 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 1 - %2 = load i8, ptr %arrayidx10.3, align 1 - %arrayidx14.3 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 4 - store i8 %2, ptr %arrayidx14.3, align 1 - %add.4 = add i8 %0, %2 - %arrayidx14.4 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 5 - store i8 %add.4, ptr %arrayidx14.4, align 1 - %add.5 = add i8 %1, %2 - %arrayidx14.5 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 6 - store i8 %add.5, ptr %arrayidx14.5, align 1 - %add.6 = add i8 %0, %add.5 - %arrayidx14.6 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 7 - store i8 %add.6, ptr %arrayidx14.6, align 1 - %3 = load i8, ptr %in.addr.031, align 1 - %arrayidx14.7 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 8 - store i8 %3, ptr %arrayidx14.7, align 1 - %add.8 = add i8 %0, %3 - %arrayidx14.8 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 9 - store i8 %add.8, ptr %arrayidx14.8, align 1 - %add.9 = add i8 %1, %3 - %arrayidx14.9 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 10 - store i8 %add.9, ptr %arrayidx14.9, align 1 - %add.10 = add i8 %0, %add.9 - %arrayidx14.10 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 11 - store i8 %add.10, ptr %arrayidx14.10, align 1 - %add.11 = add i8 %2, %3 - %arrayidx14.11 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 12 - store i8 %add.11, ptr %arrayidx14.11, align 1 - %add.12 = add i8 %0, %add.11 - %arrayidx14.12 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 13 - store i8 %add.12, ptr %arrayidx14.12, align 1 - %add.13 = add i8 %1, %add.11 - %arrayidx14.13 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 14 - store i8 %add.13, ptr %arrayidx14.13, align 1 - %add.14 = add i8 %0, %add.13 - %arrayidx14.14 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 15 - store i8 %add.14, ptr %arrayidx14.14, align 1 - %add.ptr = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 16 - %add.ptr15 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 4 - %inc17 = add nuw nsw i32 %i.033, 1 - %exitcond.not = icmp eq i32 %inc17, 32 - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0 -} - -!0 = distinct !{!0, !1} -!1 = !{!"llvm.loop.mustprogress"} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll index cdddcc9fc422..68cfc659e1e9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6 -; RUN: opt -p loop-vectorize -max-interleave-group-factor=4 -S %s | FileCheck %s +; RUN: opt -p loop-vectorize -S %s | FileCheck %s target triple = "arm64-apple-macosx15.0.0" diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll index 54b7f2afe1ed..f2ae327778f4 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll @@ -925,20 +925,20 @@ define void @same_op8_splat(ptr noalias noundef %a, ptr noundef %b, ptr noundef ; CHECK-SAME: ptr noalias noundef captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], ptr noundef readonly captures(none) [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[C]], align 4 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <32 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x float>, ptr [[TMP5]], align 4 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x float>, ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_VEC19:%.*]] = load <32 x float>, ptr [[TMP6]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <32 x float> [[WIDE_VEC]], [[TMP1]] -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = fadd fast <32 x float> [[WIDE_VEC19]], [[TMP4]] -; CHECK-NEXT: store <32 x float> [[INTERLEAVED_VEC]], ptr [[TMP6]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[WIDE_VEC19:%.*]] = load <16 x float>, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <16 x float> [[WIDE_VEC]], [[TMP1]] +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = fadd fast <16 x float> [[WIDE_VEC19]], [[TMP4]] +; CHECK-NEXT: store <16 x float> [[INTERLEAVED_VEC]], ptr [[TMP6]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 144 ; CHECK-NEXT: br i1 [[TMP25]], label %[[FOR_END11:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[FOR_END11]]: