[LowerMemIntrinsics][AMDGPU] Optimize memset.pattern lowering (#185901)

This patch changes the lowering of the [experimental.memset.pattern intrinsic](https://llvm.org/docs/LangRef.html#llvm-experimental-memset-pattern-intrinsic)
to match the optimized memset and memcpy lowering when possible. (The tl;dr of
memset.pattern is that it is like memset, except that you can use it to set
values that are wider than a single byte.)

The memset.pattern lowering now queries `TTI::getMemcpyLoopLoweringType` for a
preferred memory access type. If the size of that type is a multiple of the
size of the set value's type, and if store size and alloc size are equal for
each of the two types (since the semantics of memset.pattern are not well
suited for access widening if store and alloc size differ), the memset.pattern
is lowered into two loops: a main loop that stores a sufficiently wide vector
splat of the SetValue with the preferred memory access type, and a residual
loop that covers the remaining set values individually.

In contrast to the memset lowering, this patch doesn't include a specialized
lowering for residual loops with known constant lengths. Loops that are
statically known to be unreachable will not be emitted.

For backends that don't override `TTI::getMemcpyLoopLoweringType`, the
generated code is mostly unchanged except for more consistent basic block
names and, for memset.patterns with known size, no more `br i1 false` and a
flipped loop condition (see test changes).

This is a follow-up to a similar patch for memset: #169040
This commit is contained in:
Fabian Ritter 2026-03-13 10:37:33 +01:00 committed by GitHub
parent 26ac669101
commit f2749f6645
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 1652 additions and 145 deletions

View File

@ -72,7 +72,15 @@ LLVM_ABI void expandMemSetAsLoop(MemSetInst *MemSet,
const TargetTransformInfo &TTI);
/// Expand \p MemSetPattern as a loop. \p MemSet is not deleted.
LLVM_ABI void expandMemSetPatternAsLoop(MemSetPatternInst *MemSet);
/// If \p TTI is provided, the memset.pattern is expanded according to the
/// target's preferences. Otherwise, it is expanded as an element-wise loop.
LLVM_ABI void
expandMemSetPatternAsLoop(MemSetPatternInst *MemSet,
const TargetTransformInfo *TTI = nullptr);
/// Expand \p MemSetPattern as a loop. \p MemSet is not deleted.
LLVM_ABI void expandMemSetPatternAsLoop(MemSetPatternInst *MemSet,
const TargetTransformInfo &TTI);
/// Expand \p AtomicMemCpy as a loop. \p AtomicMemCpy is not deleted.
LLVM_ABI void expandAtomicMemCpyAsLoop(AnyMemCpyInst *AtomicMemCpy,

View File

@ -410,12 +410,14 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(
}
case Intrinsic::experimental_memset_pattern: {
auto *Memset = cast<MemSetPatternInst>(Inst);
const TargetLibraryInfo &TLI = LookupTLI(*Memset->getFunction());
Function *ParentFunc = Memset->getFunction();
const TargetLibraryInfo &TLI = LookupTLI(*ParentFunc);
Constant *PatternValue = getMemSetPattern16Value(Memset, TLI);
if (!PatternValue) {
// If it isn't possible to emit a memset_pattern16 libcall, expand to
// a loop instead.
expandMemSetPatternAsLoop(Memset);
const TargetTransformInfo &TTI = LookupTTI(*ParentFunc);
expandMemSetPatternAsLoop(Memset, TTI);
Changed = true;
Memset->eraseFromParent();
break;

View File

@ -645,7 +645,8 @@ bool StoreFatPtrsAsIntsAndExpandMemcpyVisitor::visitMemSetPatternInst(
MemSetPatternInst &MSPI) {
if (MSPI.getDestAddressSpace() != AMDGPUAS::BUFFER_FAT_POINTER)
return false;
llvm::expandMemSetPatternAsLoop(&MSPI);
llvm::expandMemSetPatternAsLoop(
&MSPI, TM->getTargetTransformInfo(*MSPI.getFunction()));
MSPI.eraseFromParent();
return true;
}

View File

@ -1307,62 +1307,103 @@ createMemSetLoopUnknownSize(Instruction *InsertBefore, Value *DstAddr,
IsVolatile);
}
static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
Value *CopyLen, Value *SetValue, Align DstAlign,
std::optional<uint64_t> AverageTripCount,
bool IsVolatile) {
// Currently no longer used for memset, only for memset.pattern.
// TODO: Update the memset.pattern lowering to also use the loop expansion
// framework and remove this function.
Type *TypeOfCopyLen = CopyLen->getType();
BasicBlock *OrigBB = InsertBefore->getParent();
Function *F = OrigBB->getParent();
const DataLayout &DL = F->getDataLayout();
BasicBlock *NewBB =
OrigBB->splitBasicBlock(InsertBefore, "split");
BasicBlock *LoopBB
= BasicBlock::Create(F->getContext(), "loadstoreloop", F, NewBB);
static void createMemSetPatternLoop(Instruction *InsertBefore, Value *DstAddr,
Value *Len, Value *SetValue, Align DstAlign,
bool IsVolatile,
const TargetTransformInfo *TTI,
std::optional<uint64_t> AverageTripCount) {
// No need to expand zero length memset.pattern.
if (auto *CLen = dyn_cast<ConstantInt>(Len))
if (CLen->isZero())
return;
const DebugLoc &DbgLoc = InsertBefore->getStableDebugLoc();
IRBuilder<> Builder(OrigBB->getTerminator());
Builder.SetCurrentDebugLocation(DbgLoc);
BasicBlock *PreLoopBB = InsertBefore->getParent();
Function *ParentFunc = PreLoopBB->getParent();
const DataLayout &DL = ParentFunc->getDataLayout();
LLVMContext &Ctx = PreLoopBB->getContext();
auto *ToLoopBR = Builder.CreateCondBr(
Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB,
LoopBB);
MDBuilder MDB(F->getContext());
if (AverageTripCount.has_value())
ToLoopBR->setMetadata(LLVMContext::MD_prof,
MDB.createLikelyBranchWeights());
else
setExplicitlyUnknownBranchWeightsIfProfiled(*ToLoopBR, DEBUG_TYPE);
unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
OrigBB->getTerminator()->eraseFromParent();
Type *PreferredLoopOpType = SetValue->getType();
if (TTI) {
PreferredLoopOpType = TTI->getMemcpyLoopLoweringType(
Ctx, Len, DstAS, DstAS, DstAlign, DstAlign, std::nullopt);
}
TypeSize PreferredLoopOpStoreSize = DL.getTypeStoreSize(PreferredLoopOpType);
assert(PreferredLoopOpStoreSize.isFixed() &&
"PreferredLoopOpType cannot be a scalable vector type");
TypeSize PartSize = DL.getTypeStoreSize(SetValue->getType());
Align PartAlign(commonAlignment(DstAlign, PartSize));
TypeSize PreferredLoopOpAllocSize = DL.getTypeAllocSize(PreferredLoopOpType);
IRBuilder<> LoopBuilder(LoopBB);
LoopBuilder.SetCurrentDebugLocation(DbgLoc);
PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
Type *OriginalType = SetValue->getType();
TypeSize OriginalTypeStoreSize = DL.getTypeStoreSize(OriginalType);
TypeSize OriginalTypeAllocSize = DL.getTypeAllocSize(OriginalType);
LoopBuilder.CreateAlignedStore(
SetValue,
LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex),
PartAlign, IsVolatile);
// The semantics of memset.pattern restrict what vectorization we can do: It
// has to behave like a series of stores of the SetValue type at offsets that
// are spaced by the alloc size of the SetValue type. If store and alloc size
// of the SetValue type don't match, the bytes that aren't covered by these
// stores must not be overwritten. We therefore only vectorize memset.pattern
// if the store and alloc sizes of the SetValue are equal and properly divide
// the size of the preferred lowering type (and only if store and alloc size
// for the preferred lowering type are also equal).
Value *NewIndex =
LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
LoopIndex->addIncoming(NewIndex, LoopBB);
unsigned MainLoopStep = 1;
Type *MainLoopType = OriginalType;
TypeSize MainLoopAllocSize = OriginalTypeAllocSize;
unsigned ResidualLoopStep = 0;
Type *ResidualLoopType = nullptr;
auto *LoopBR = LoopBuilder.CreateCondBr(
LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB, NewBB);
if (AverageTripCount.has_value())
setFittedBranchWeights(*LoopBR, {AverageTripCount.value(), 1},
/*IsExpected=*/false);
else
setExplicitlyUnknownBranchWeightsIfProfiled(*LoopBR, DEBUG_TYPE);
if (PreferredLoopOpStoreSize == PreferredLoopOpAllocSize &&
OriginalTypeStoreSize == OriginalTypeAllocSize &&
OriginalTypeStoreSize < PreferredLoopOpStoreSize &&
PreferredLoopOpStoreSize % OriginalTypeStoreSize == 0) {
// Multiple instances of SetValue can be combined to reach the preferred
// loop op size.
MainLoopStep = PreferredLoopOpStoreSize / OriginalTypeStoreSize;
MainLoopType = PreferredLoopOpType;
MainLoopAllocSize = PreferredLoopOpStoreSize;
ResidualLoopStep = 1;
ResidualLoopType = OriginalType;
}
// The step arguments here are in terms of the alloc size of the SetValue, not
// in terms of bytes.
LoopExpansionInfo LEI =
insertLoopExpansion(InsertBefore, Len, MainLoopStep, ResidualLoopStep,
"memset.pattern", AverageTripCount);
Align PartDstAlign(commonAlignment(DstAlign, MainLoopAllocSize));
if (LEI.MainLoopIP) {
// Create the loop-invariant splat value before the loop.
IRBuilder<> PreLoopBuilder(PreLoopBB->getTerminator());
Value *MainLoopSetValue = SetValue;
if (MainLoopType != OriginalType)
MainLoopSetValue =
createMemSetSplat(DL, PreLoopBuilder, SetValue, MainLoopType);
// Fill MainLoopBB
IRBuilder<> MainLoopBuilder(LEI.MainLoopIP);
Value *DstGEP = MainLoopBuilder.CreateInBoundsGEP(MainLoopType, DstAddr,
LEI.MainLoopIndex);
MainLoopBuilder.CreateAlignedStore(MainLoopSetValue, DstGEP, PartDstAlign,
IsVolatile);
}
if (!LEI.ResidualLoopIP)
return;
// Fill ResidualLoopBB
Align ResDstAlign(
commonAlignment(PartDstAlign, DL.getTypeAllocSize(ResidualLoopType)));
IRBuilder<> ResLoopBuilder(LEI.ResidualLoopIP);
Value *ResDstGEP = ResLoopBuilder.CreateInBoundsGEP(ResidualLoopType, DstAddr,
LEI.ResidualLoopIndex);
ResLoopBuilder.CreateAlignedStore(SetValue, ResDstGEP, ResDstAlign,
IsVolatile);
}
template <typename T>
@ -1501,14 +1542,22 @@ void llvm::expandMemSetAsLoop(MemSetInst *MemSet,
expandMemSetAsLoop(MemSet, &TTI);
}
void llvm::expandMemSetPatternAsLoop(MemSetPatternInst *Memset) {
createMemSetLoop(/*InsertBefore=*/Memset,
/*DstAddr=*/Memset->getRawDest(),
/*CopyLen=*/Memset->getLength(),
/*SetValue=*/Memset->getValue(),
/*DstAlign=*/Memset->getDestAlign().valueOrOne(),
/*AverageTripCount=*/getAverageMemOpLoopTripCount(*Memset),
/*IsVolatile=*/Memset->isVolatile());
void llvm::expandMemSetPatternAsLoop(MemSetPatternInst *Memset,
const TargetTransformInfo *TTI) {
createMemSetPatternLoop(
/*InsertBefore=*/Memset,
/*DstAddr=*/Memset->getRawDest(),
/*Len=*/Memset->getLength(),
/*SetValue=*/Memset->getValue(),
/*DstAlign=*/Memset->getDestAlign().valueOrOne(),
/*IsVolatile=*/Memset->isVolatile(),
/*TTI=*/TTI,
/*AverageTripCount=*/getAverageMemOpLoopTripCount(*Memset));
}
void llvm::expandMemSetPatternAsLoop(MemSetPatternInst *MemSet,
const TargetTransformInfo &TTI) {
expandMemSetPatternAsLoop(MemSet, &TTI);
}
void llvm::expandAtomicMemCpyAsLoop(AnyMemCpyInst *AtomicMemcpy,

View File

@ -1724,15 +1724,45 @@ define void @memset_pattern_known(ptr addrspace(7) inreg %ptr) {
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
; CHECK-NEXT: br label %[[LOADSTORELOOP:.*]]
; CHECK: [[LOADSTORELOOP]]:
; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
; CHECK-NEXT: [[DOTIDX:%.*]] = mul nsw i32 [[TMP1]], 4
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX]]
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 1, ptr addrspace(8) align 4 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0)
; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 8192
; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ]
; CHECK-NEXT: [[DOTIDX1:%.*]] = mul nsw i32 [[LOOP_INDEX]], 256
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX1]]
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[TMP5]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP5]], 16
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP5]], 32
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP5]], 48
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP5]], 64
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP5]], 80
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP5]], 96
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP5]], 112
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP5]], 128
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP5]], 144
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP5]], 160
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP5]], 176
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP5]], 192
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP5]], 208
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP5]], 224
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP5]], 240
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 16 [[PTR_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
; CHECK-NEXT: [[TMP4]] = add i32 [[LOOP_INDEX]], 64
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i32 [[TMP4]], 8192
; CHECK-NEXT: br i1 [[TMP3]], label %[[LOADSTORELOOP]], label %[[SPLIT:.*]]
; CHECK: [[SPLIT]]:
; CHECK-NEXT: ret void
;
@ -1745,15 +1775,15 @@ define void @memset_pattern_known_small(ptr addrspace(7) inreg %ptr) {
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
; CHECK: [[LOADSTORELOOP]]:
; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
; CHECK-NEXT: br label %[[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY:.*]]
; CHECK: [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]]:
; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]] ]
; CHECK-NEXT: [[DOTIDX:%.*]] = mul nsw i32 [[TMP1]], 4
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX]]
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0)
; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 32
; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
; CHECK-NEXT: br i1 [[TMP4]], label %[[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]], label %[[SPLIT:.*]]
; CHECK: [[SPLIT]]:
; CHECK-NEXT: ret void
;
@ -1766,16 +1796,46 @@ define void @memset_pattern_known_i64(ptr addrspace(7) inreg %ptr) {
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
; CHECK-NEXT: br label %[[LOADSTORELOOP:.*]]
; CHECK: [[LOADSTORELOOP]]:
; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[LOADSTORELOOP]] ]
; CHECK-NEXT: [[DOTC:%.*]] = trunc i64 [[TMP1]] to i32
; CHECK-NEXT: [[DOTIDX:%.*]] = mul nsw i32 [[DOTC]], 4
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX]]
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 0)
; CHECK-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 8192
; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
; CHECK-NEXT: [[DOTIDX1:%.*]] = mul nsw i32 [[DOTC]], 256
; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX1]]
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP8]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_4:%.*]] = add nuw i32 [[TMP8]], 16
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_4]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_8:%.*]] = add nuw i32 [[TMP8]], 32
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_8]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_12:%.*]] = add nuw i32 [[TMP8]], 48
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_12]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_16:%.*]] = add nuw i32 [[TMP8]], 64
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_16]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_20:%.*]] = add nuw i32 [[TMP8]], 80
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_20]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_24:%.*]] = add nuw i32 [[TMP8]], 96
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_24]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_28:%.*]] = add nuw i32 [[TMP8]], 112
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_28]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_32:%.*]] = add nuw i32 [[TMP8]], 128
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_32]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_36:%.*]] = add nuw i32 [[TMP8]], 144
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_36]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_40:%.*]] = add nuw i32 [[TMP8]], 160
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_40]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_44:%.*]] = add nuw i32 [[TMP8]], 176
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_44]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_48:%.*]] = add nuw i32 [[TMP8]], 192
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_48]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_52:%.*]] = add nuw i32 [[TMP8]], 208
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_52]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_56:%.*]] = add nuw i32 [[TMP8]], 224
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_56]], i32 0, i32 0)
; CHECK-NEXT: [[DOTPART_60:%.*]] = add nuw i32 [[TMP8]], 240
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[DOTPART_60]], i32 0, i32 0)
; CHECK-NEXT: [[TMP5]] = add i64 [[TMP1]], 64
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP5]], 8192
; CHECK-NEXT: br i1 [[TMP3]], label %[[LOADSTORELOOP]], label %[[SPLIT:.*]]
; CHECK: [[SPLIT]]:
; CHECK-NEXT: ret void
;
@ -1788,15 +1848,15 @@ define void @memset_pattern_known_i32_volatile(ptr addrspace(7) inreg %ptr) {
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
; CHECK: [[LOADSTORELOOP]]:
; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
; CHECK-NEXT: br label %[[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY:.*]]
; CHECK: [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]]:
; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]] ]
; CHECK-NEXT: [[DOTIDX:%.*]] = mul nsw i32 [[TMP1]], 4
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX]]
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP2]], i32 0, i32 -2147483648)
; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 32
; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
; CHECK-NEXT: br i1 [[TMP4]], label %[[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]], label %[[SPLIT:.*]]
; CHECK: [[SPLIT]]:
; CHECK-NEXT: ret void
;
@ -1809,16 +1869,30 @@ define void @memset_pattern_unknown(ptr addrspace(7) inreg %ptr, i32 inreg %leng
; CHECK-SAME: { ptr addrspace(8), i32 } inreg [[PTR:%.*]], i32 inreg [[LENGTH:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[PTR_RSRC:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 0
; CHECK-NEXT: [[PTR_OFF:%.*]] = extractvalue { ptr addrspace(8), i32 } [[PTR]], 1
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 0, [[LENGTH]]
; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[LENGTH]], 3
; CHECK-NEXT: [[TMP8:%.*]] = sub i32 [[LENGTH]], [[TMP1]]
; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i32 [[TMP8]], 0
; CHECK-NEXT: br i1 [[TMP9]], label %[[LOADSTORELOOP:.*]], label %[[MEMSET_PATTERN_EXPANSION_RESIDUAL_COND:.*]]
; CHECK: [[LOADSTORELOOP]]:
; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ]
; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[LOADSTORELOOP]] ]
; CHECK-NEXT: [[DOTIDX1:%.*]] = mul nsw i32 [[LOOP_INDEX]], 16
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX1]]
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> splat (i32 1), ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP4]], i32 0, i32 0)
; CHECK-NEXT: [[TMP5]] = add i32 [[LOOP_INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], [[TMP8]]
; CHECK-NEXT: br i1 [[TMP6]], label %[[LOADSTORELOOP]], label %[[MEMSET_PATTERN_EXPANSION_RESIDUAL_COND]]
; CHECK: [[MEMSET_PATTERN_EXPANSION_RESIDUAL_COND]]:
; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i32 [[TMP1]], 0
; CHECK-NEXT: br i1 [[TMP7]], label %[[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY:.*]], label %[[SPLIT:.*]]
; CHECK: [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]]:
; CHECK-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i32 [ 0, %[[MEMSET_PATTERN_EXPANSION_RESIDUAL_COND]] ], [ [[TMP10:%.*]], %[[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP8]], [[RESIDUAL_LOOP_INDEX]]
; CHECK-NEXT: [[DOTIDX:%.*]] = mul nsw i32 [[TMP2]], 4
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[PTR_OFF]], [[DOTIDX]]
; CHECK-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 1, ptr addrspace(8) align 1 [[PTR_RSRC]], i32 [[TMP3]], i32 0, i32 0)
; CHECK-NEXT: [[TMP4]] = add i32 [[TMP2]], 1
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i32 [[TMP4]], [[LENGTH]]
; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
; CHECK-NEXT: [[TMP10]] = add i32 [[RESIDUAL_LOOP_INDEX]], 1
; CHECK-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP1]]
; CHECK-NEXT: br i1 [[TMP11]], label %[[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]], label %[[SPLIT]]
; CHECK: [[SPLIT]]:
; CHECK-NEXT: ret void
;

File diff suppressed because it is too large Load Diff

View File

@ -13,14 +13,14 @@
define void @memset_1(ptr %a, i128 %value) nounwind {
; RV32-BOTH-LABEL: memset_1:
; RV32-BOTH: # %bb.0: # %loadstoreloop.preheader
; RV32-BOTH: # %bb.0:
; RV32-BOTH-NEXT: li a2, 0
; RV32-BOTH-NEXT: lw a3, 0(a1)
; RV32-BOTH-NEXT: lw a4, 4(a1)
; RV32-BOTH-NEXT: lw a5, 8(a1)
; RV32-BOTH-NEXT: lw a1, 12(a1)
; RV32-BOTH-NEXT: li a6, 0
; RV32-BOTH-NEXT: .LBB0_1: # %loadstoreloop
; RV32-BOTH-NEXT: .LBB0_1: # %memset.pattern-expansion-main-body
; RV32-BOTH-NEXT: # =>This Inner Loop Header: Depth=1
; RV32-BOTH-NEXT: slli a7, a2, 4
; RV32-BOTH-NEXT: addi a2, a2, 1
@ -33,19 +33,19 @@ define void @memset_1(ptr %a, i128 %value) nounwind {
; RV32-BOTH-NEXT: sw a5, 8(a7)
; RV32-BOTH-NEXT: sw a1, 12(a7)
; RV32-BOTH-NEXT: beqz t0, .LBB0_1
; RV32-BOTH-NEXT: # %bb.2: # %split
; RV32-BOTH-NEXT: # %bb.2: # %memset.pattern-post-expansion
; RV32-BOTH-NEXT: ret
;
; RV64-BOTH-LABEL: memset_1:
; RV64-BOTH: # %bb.0: # %loadstoreloop.preheader
; RV64-BOTH: # %bb.0:
; RV64-BOTH-NEXT: addi a3, a0, 16
; RV64-BOTH-NEXT: .LBB0_1: # %loadstoreloop
; RV64-BOTH-NEXT: .LBB0_1: # %memset.pattern-expansion-main-body
; RV64-BOTH-NEXT: # =>This Inner Loop Header: Depth=1
; RV64-BOTH-NEXT: sd a1, 0(a0)
; RV64-BOTH-NEXT: sd a2, 8(a0)
; RV64-BOTH-NEXT: addi a0, a0, 16
; RV64-BOTH-NEXT: bne a0, a3, .LBB0_1
; RV64-BOTH-NEXT: # %bb.2: # %split
; RV64-BOTH-NEXT: # %bb.2: # %memset.pattern-post-expansion
; RV64-BOTH-NEXT: ret
tail call void @llvm.experimental.memset.pattern(ptr align 8 %a, i128 %value, i64 1, i1 0)
ret void
@ -53,7 +53,7 @@ define void @memset_1(ptr %a, i128 %value) nounwind {
define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
; RV32-LABEL: memset_1_noalign:
; RV32: # %bb.0: # %loadstoreloop.preheader
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -32
; RV32-NEXT: sw s0, 28(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s1, 24(sp) # 4-byte Folded Spill
@ -79,7 +79,7 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
; RV32-NEXT: srli s1, a1, 24
; RV32-NEXT: srli s2, a1, 16
; RV32-NEXT: srli s3, a1, 8
; RV32-NEXT: .LBB1_1: # %loadstoreloop
; RV32-NEXT: .LBB1_1: # %memset.pattern-expansion-main-body
; RV32-NEXT: # =>This Inner Loop Header: Depth=1
; RV32-NEXT: slli s4, a2, 4
; RV32-NEXT: addi a2, a2, 1
@ -104,7 +104,7 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
; RV32-NEXT: sb s2, 14(s4)
; RV32-NEXT: sb s1, 15(s4)
; RV32-NEXT: beqz s5, .LBB1_1
; RV32-NEXT: # %bb.2: # %split
; RV32-NEXT: # %bb.2: # %memset.pattern-post-expansion
; RV32-NEXT: lw s0, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s1, 24(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s2, 20(sp) # 4-byte Folded Reload
@ -115,7 +115,7 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
; RV32-NEXT: ret
;
; RV64-LABEL: memset_1_noalign:
; RV64: # %bb.0: # %loadstoreloop.preheader
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -32
; RV64-NEXT: sd s0, 24(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s1, 16(sp) # 8-byte Folded Spill
@ -135,7 +135,7 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
; RV64-NEXT: srli s0, a2, 24
; RV64-NEXT: srli s1, a2, 16
; RV64-NEXT: srli s2, a2, 8
; RV64-NEXT: .LBB1_1: # %loadstoreloop
; RV64-NEXT: .LBB1_1: # %memset.pattern-expansion-main-body
; RV64-NEXT: # =>This Inner Loop Header: Depth=1
; RV64-NEXT: sb a7, 4(a0)
; RV64-NEXT: sb a6, 5(a0)
@ -155,7 +155,7 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
; RV64-NEXT: sb s0, 11(a0)
; RV64-NEXT: addi a0, a0, 16
; RV64-NEXT: bne a0, a3, .LBB1_1
; RV64-NEXT: # %bb.2: # %split
; RV64-NEXT: # %bb.2: # %memset.pattern-post-expansion
; RV64-NEXT: ld s0, 24(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s1, 16(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s2, 8(sp) # 8-byte Folded Reload
@ -163,14 +163,14 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
; RV64-NEXT: ret
;
; RV32-FAST-LABEL: memset_1_noalign:
; RV32-FAST: # %bb.0: # %loadstoreloop.preheader
; RV32-FAST: # %bb.0:
; RV32-FAST-NEXT: li a2, 0
; RV32-FAST-NEXT: lw a3, 0(a1)
; RV32-FAST-NEXT: lw a4, 4(a1)
; RV32-FAST-NEXT: lw a5, 8(a1)
; RV32-FAST-NEXT: lw a1, 12(a1)
; RV32-FAST-NEXT: li a6, 0
; RV32-FAST-NEXT: .LBB1_1: # %loadstoreloop
; RV32-FAST-NEXT: .LBB1_1: # %memset.pattern-expansion-main-body
; RV32-FAST-NEXT: # =>This Inner Loop Header: Depth=1
; RV32-FAST-NEXT: slli a7, a2, 4
; RV32-FAST-NEXT: addi a2, a2, 1
@ -183,19 +183,19 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
; RV32-FAST-NEXT: sw a5, 8(a7)
; RV32-FAST-NEXT: sw a1, 12(a7)
; RV32-FAST-NEXT: beqz t0, .LBB1_1
; RV32-FAST-NEXT: # %bb.2: # %split
; RV32-FAST-NEXT: # %bb.2: # %memset.pattern-post-expansion
; RV32-FAST-NEXT: ret
;
; RV64-FAST-LABEL: memset_1_noalign:
; RV64-FAST: # %bb.0: # %loadstoreloop.preheader
; RV64-FAST: # %bb.0:
; RV64-FAST-NEXT: addi a3, a0, 16
; RV64-FAST-NEXT: .LBB1_1: # %loadstoreloop
; RV64-FAST-NEXT: .LBB1_1: # %memset.pattern-expansion-main-body
; RV64-FAST-NEXT: # =>This Inner Loop Header: Depth=1
; RV64-FAST-NEXT: sd a1, 0(a0)
; RV64-FAST-NEXT: sd a2, 8(a0)
; RV64-FAST-NEXT: addi a0, a0, 16
; RV64-FAST-NEXT: bne a0, a3, .LBB1_1
; RV64-FAST-NEXT: # %bb.2: # %split
; RV64-FAST-NEXT: # %bb.2: # %memset.pattern-post-expansion
; RV64-FAST-NEXT: ret
tail call void @llvm.experimental.memset.pattern(ptr %a, i128 %value, i64 1, i1 0)
ret void
@ -203,14 +203,14 @@ define void @memset_1_noalign(ptr %a, i128 %value) nounwind {
define void @memset_4(ptr %a, i128 %value) nounwind {
; RV32-BOTH-LABEL: memset_4:
; RV32-BOTH: # %bb.0: # %loadstoreloop.preheader
; RV32-BOTH: # %bb.0:
; RV32-BOTH-NEXT: li a2, 0
; RV32-BOTH-NEXT: lw a3, 0(a1)
; RV32-BOTH-NEXT: lw a4, 4(a1)
; RV32-BOTH-NEXT: lw a5, 8(a1)
; RV32-BOTH-NEXT: lw a1, 12(a1)
; RV32-BOTH-NEXT: li a6, 0
; RV32-BOTH-NEXT: .LBB2_1: # %loadstoreloop
; RV32-BOTH-NEXT: .LBB2_1: # %memset.pattern-expansion-main-body
; RV32-BOTH-NEXT: # =>This Inner Loop Header: Depth=1
; RV32-BOTH-NEXT: slli a7, a2, 4
; RV32-BOTH-NEXT: addi a2, a2, 1
@ -225,19 +225,19 @@ define void @memset_4(ptr %a, i128 %value) nounwind {
; RV32-BOTH-NEXT: sw a5, 8(a7)
; RV32-BOTH-NEXT: sw a1, 12(a7)
; RV32-BOTH-NEXT: bnez t0, .LBB2_1
; RV32-BOTH-NEXT: # %bb.2: # %split
; RV32-BOTH-NEXT: # %bb.2: # %memset.pattern-post-expansion
; RV32-BOTH-NEXT: ret
;
; RV64-BOTH-LABEL: memset_4:
; RV64-BOTH: # %bb.0: # %loadstoreloop.preheader
; RV64-BOTH: # %bb.0:
; RV64-BOTH-NEXT: addi a3, a0, 64
; RV64-BOTH-NEXT: .LBB2_1: # %loadstoreloop
; RV64-BOTH-NEXT: .LBB2_1: # %memset.pattern-expansion-main-body
; RV64-BOTH-NEXT: # =>This Inner Loop Header: Depth=1
; RV64-BOTH-NEXT: sd a1, 0(a0)
; RV64-BOTH-NEXT: sd a2, 8(a0)
; RV64-BOTH-NEXT: addi a0, a0, 16
; RV64-BOTH-NEXT: bne a0, a3, .LBB2_1
; RV64-BOTH-NEXT: # %bb.2: # %split
; RV64-BOTH-NEXT: # %bb.2: # %memset.pattern-post-expansion
; RV64-BOTH-NEXT: ret
tail call void @llvm.experimental.memset.pattern(ptr align 8 %a, i128 %value, i64 4, i1 0)
ret void
@ -248,7 +248,7 @@ define void @memset_x(ptr %a, i128 %value, i64 %x) nounwind {
; RV32-BOTH: # %bb.0:
; RV32-BOTH-NEXT: or a4, a2, a3
; RV32-BOTH-NEXT: beqz a4, .LBB3_5
; RV32-BOTH-NEXT: # %bb.1: # %loadstoreloop.preheader
; RV32-BOTH-NEXT: # %bb.1: # %memset.pattern-expansion-main-body.preheader
; RV32-BOTH-NEXT: li a4, 0
; RV32-BOTH-NEXT: lw a5, 0(a1)
; RV32-BOTH-NEXT: lw a6, 4(a1)
@ -256,11 +256,11 @@ define void @memset_x(ptr %a, i128 %value, i64 %x) nounwind {
; RV32-BOTH-NEXT: lw a1, 12(a1)
; RV32-BOTH-NEXT: li t0, 0
; RV32-BOTH-NEXT: j .LBB3_3
; RV32-BOTH-NEXT: .LBB3_2: # %loadstoreloop
; RV32-BOTH-NEXT: .LBB3_2: # %memset.pattern-expansion-main-body
; RV32-BOTH-NEXT: # in Loop: Header=BB3_3 Depth=1
; RV32-BOTH-NEXT: sltu t1, t0, a3
; RV32-BOTH-NEXT: beqz t1, .LBB3_5
; RV32-BOTH-NEXT: .LBB3_3: # %loadstoreloop
; RV32-BOTH-NEXT: .LBB3_3: # %memset.pattern-expansion-main-body
; RV32-BOTH-NEXT: # =>This Inner Loop Header: Depth=1
; RV32-BOTH-NEXT: slli t1, a4, 4
; RV32-BOTH-NEXT: addi a4, a4, 1
@ -275,22 +275,22 @@ define void @memset_x(ptr %a, i128 %value, i64 %x) nounwind {
; RV32-BOTH-NEXT: # %bb.4: # in Loop: Header=BB3_3 Depth=1
; RV32-BOTH-NEXT: sltu t1, a4, a2
; RV32-BOTH-NEXT: bnez t1, .LBB3_3
; RV32-BOTH-NEXT: .LBB3_5: # %split
; RV32-BOTH-NEXT: .LBB3_5: # %memset.pattern-post-expansion
; RV32-BOTH-NEXT: ret
;
; RV64-BOTH-LABEL: memset_x:
; RV64-BOTH: # %bb.0:
; RV64-BOTH-NEXT: beqz a3, .LBB3_3
; RV64-BOTH-NEXT: # %bb.1: # %loadstoreloop.preheader
; RV64-BOTH-NEXT: # %bb.1: # %memset.pattern-expansion-main-body.preheader
; RV64-BOTH-NEXT: li a4, 0
; RV64-BOTH-NEXT: .LBB3_2: # %loadstoreloop
; RV64-BOTH-NEXT: .LBB3_2: # %memset.pattern-expansion-main-body
; RV64-BOTH-NEXT: # =>This Inner Loop Header: Depth=1
; RV64-BOTH-NEXT: sd a1, 0(a0)
; RV64-BOTH-NEXT: sd a2, 8(a0)
; RV64-BOTH-NEXT: addi a4, a4, 1
; RV64-BOTH-NEXT: addi a0, a0, 16
; RV64-BOTH-NEXT: bltu a4, a3, .LBB3_2
; RV64-BOTH-NEXT: .LBB3_3: # %split
; RV64-BOTH-NEXT: .LBB3_3: # %memset.pattern-post-expansion
; RV64-BOTH-NEXT: ret
tail call void @llvm.experimental.memset.pattern(ptr align 8 %a, i128 %value, i64 %x, i1 0)
ret void

View File

@ -0,0 +1,273 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -p=pre-isel-intrinsic-lowering -S < %s | FileCheck -check-prefixes=CHECK,DEFAULT %s
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -p=pre-isel-intrinsic-lowering -S -amdgpu-memcpy-loop-unroll=2 %s -o - | FileCheck -check-prefixes=CHECK,UNROLL2 %s
; A constant length of 0 produces no loop at all: the function body is expected
; to consist of the return only.
define void @memset_pattern_i128_len0_dynvalue(ptr align 16 %a, i128 %value) {
; CHECK-LABEL: @memset_pattern_i128_len0_dynvalue(
; CHECK-NEXT: ret void
;
call void @llvm.experimental.memset.pattern(ptr align 16 %a, i128 %value, i64 0, i1 false)
ret void
}
; Constant length 1 with a non-constant i128 value: no main loop is expected,
; only the residual loop, which stores one i128 per iteration.
define void @memset_pattern_i128_len1_dynvalue(ptr align 16 %a, i128 %value) {
; CHECK-LABEL: @memset_pattern_i128_len1_dynvalue(
; CHECK-NEXT: br label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY:%.*]]
; CHECK: memset.pattern-expansion-residual-body:
; CHECK-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP2:%.*]], [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i128, ptr [[A:%.*]], i64 [[RESIDUAL_LOOP_INDEX]]
; CHECK-NEXT: store i128 [[VALUE:%.*]], ptr [[TMP1]], align 16
; CHECK-NEXT: [[TMP2]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 1
; CHECK-NEXT: br i1 [[TMP3]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
; CHECK: memset.pattern-post-expansion:
; CHECK-NEXT: ret void
;
call void @llvm.experimental.memset.pattern(ptr align 16 %a, i128 %value, i64 1, i1 false)
ret void
}
; Constant length 1 with a constant i128 value: only the residual loop is
; expected, storing the constant directly.
define void @memset_pattern_i128_len1(ptr align 16 %a) {
; CHECK-LABEL: @memset_pattern_i128_len1(
; CHECK-NEXT: br label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY:%.*]]
; CHECK: memset.pattern-expansion-residual-body:
; CHECK-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP2:%.*]], [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i128, ptr [[A:%.*]], i64 [[RESIDUAL_LOOP_INDEX]]
; CHECK-NEXT: store i128 -113427455635030943652277463699152839203, ptr [[TMP1]], align 16
; CHECK-NEXT: [[TMP2]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 1
; CHECK-NEXT: br i1 [[TMP3]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
; CHECK: memset.pattern-post-expansion:
; CHECK-NEXT: ret void
;
call void @llvm.experimental.memset.pattern(ptr align 16 %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 1, i1 false)
ret void
}
; Constant length 19 exercises both loops. With the default unroll factor the
; main loop stores a splat of 16 values and the residual loop stores the
; remaining 3 values one by one; with -amdgpu-memcpy-loop-unroll=2 the main
; loop stores 2 values per iteration up to index 18 and the residual loop
; stores the single remaining value.
define void @memset_pattern_i128_constlen_mainloop_and_residual_taken(ptr align 16 %a) {
; DEFAULT-LABEL: @memset_pattern_i128_constlen_mainloop_and_residual_taken(
; DEFAULT-NEXT: br label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY:%.*]]
; DEFAULT: memset.pattern-expansion-main-body:
; DEFAULT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP2:%.*]], [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]] ]
; DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds <64 x i32>, ptr [[A:%.*]], i64 [[LOOP_INDEX]]
; DEFAULT-NEXT: store <64 x i32> bitcast (<16 x i128> splat (i128 -113427455635030943652277463699152839203) to <64 x i32>), ptr [[TMP1]], align 16
; DEFAULT-NEXT: [[TMP2]] = add i64 [[LOOP_INDEX]], 16
; DEFAULT-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 16
; DEFAULT-NEXT: br i1 [[TMP3]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY:%.*]]
; DEFAULT: memset.pattern-expansion-residual-body:
; DEFAULT-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]] ], [ [[TMP6:%.*]], [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]] ]
; DEFAULT-NEXT: [[TMP4:%.*]] = add i64 16, [[RESIDUAL_LOOP_INDEX]]
; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP4]]
; DEFAULT-NEXT: store i128 -113427455635030943652277463699152839203, ptr [[TMP5]], align 16
; DEFAULT-NEXT: [[TMP6]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; DEFAULT-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 3
; DEFAULT-NEXT: br i1 [[TMP7]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
; DEFAULT: memset.pattern-post-expansion:
; DEFAULT-NEXT: ret void
;
; UNROLL2-LABEL: @memset_pattern_i128_constlen_mainloop_and_residual_taken(
; UNROLL2-NEXT: br label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY:%.*]]
; UNROLL2: memset.pattern-expansion-main-body:
; UNROLL2-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP2:%.*]], [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]] ]
; UNROLL2-NEXT: [[TMP1:%.*]] = getelementptr inbounds <8 x i32>, ptr [[A:%.*]], i64 [[LOOP_INDEX]]
; UNROLL2-NEXT: store <8 x i32> bitcast (<2 x i128> splat (i128 -113427455635030943652277463699152839203) to <8 x i32>), ptr [[TMP1]], align 16
; UNROLL2-NEXT: [[TMP2]] = add i64 [[LOOP_INDEX]], 2
; UNROLL2-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 18
; UNROLL2-NEXT: br i1 [[TMP3]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY:%.*]]
; UNROLL2: memset.pattern-expansion-residual-body:
; UNROLL2-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]] ], [ [[TMP6:%.*]], [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]] ]
; UNROLL2-NEXT: [[TMP4:%.*]] = add i64 18, [[RESIDUAL_LOOP_INDEX]]
; UNROLL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP4]]
; UNROLL2-NEXT: store i128 -113427455635030943652277463699152839203, ptr [[TMP5]], align 16
; UNROLL2-NEXT: [[TMP6]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; UNROLL2-NEXT: [[TMP7:%.*]] = icmp ult i64 [[TMP6]], 1
; UNROLL2-NEXT: br i1 [[TMP7]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
; UNROLL2: memset.pattern-post-expansion:
; UNROLL2-NEXT: ret void
;
call void @llvm.experimental.memset.pattern(ptr align 16 %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 19, i1 false)
ret void
}
; Same as the constant-length-1 case, but the destination is in address space
; 3; the expected GEP and store keep the addrspace(3) pointer type.
define void @memset_pattern_i128_len1_nz_as(ptr addrspace(3) align 16 %a) {
; CHECK-LABEL: @memset_pattern_i128_len1_nz_as(
; CHECK-NEXT: br label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY:%.*]]
; CHECK: memset.pattern-expansion-residual-body:
; CHECK-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP2:%.*]], [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i128, ptr addrspace(3) [[A:%.*]], i64 [[RESIDUAL_LOOP_INDEX]]
; CHECK-NEXT: store i128 -113427455635030943652277463699152839203, ptr addrspace(3) [[TMP1]], align 16
; CHECK-NEXT: [[TMP2]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 1
; CHECK-NEXT: br i1 [[TMP3]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
; CHECK: memset.pattern-post-expansion:
; CHECK-NEXT: ret void
;
call void @llvm.experimental.memset.pattern(ptr addrspace(3) align 16 %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 1, i1 false)
ret void
}
; Constant length 1 without an alignment on the destination: the expected
; store in the residual loop uses align 1.
define void @memset_pattern_i128_len1_no_align(ptr %a) {
; CHECK-LABEL: @memset_pattern_i128_len1_no_align(
; CHECK-NEXT: br label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY:%.*]]
; CHECK: memset.pattern-expansion-residual-body:
; CHECK-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP2:%.*]], [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i128, ptr [[A:%.*]], i64 [[RESIDUAL_LOOP_INDEX]]
; CHECK-NEXT: store i128 -113427455635030943652277463699152839203, ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP2]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 1
; CHECK-NEXT: br i1 [[TMP3]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
; CHECK: memset.pattern-post-expansion:
; CHECK-NEXT: ret void
;
call void @llvm.experimental.memset.pattern(ptr %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 1, i1 false)
ret void
}
; Constant length 16 is a multiple of the values stored per main-loop
; iteration in both configurations (16 by default, 2 with
; -amdgpu-memcpy-loop-unroll=2), so only the main loop is expected and no
; residual loop is emitted.
define void @memset_pattern_i128_len16(ptr align 16 %a) {
; DEFAULT-LABEL: @memset_pattern_i128_len16(
; DEFAULT-NEXT: br label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY:%.*]]
; DEFAULT: memset.pattern-expansion-main-body:
; DEFAULT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP2:%.*]], [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]] ]
; DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds <64 x i32>, ptr [[A:%.*]], i64 [[LOOP_INDEX]]
; DEFAULT-NEXT: store <64 x i32> bitcast (<16 x i128> splat (i128 -113427455635030943652277463699152839203) to <64 x i32>), ptr [[TMP1]], align 16
; DEFAULT-NEXT: [[TMP2]] = add i64 [[LOOP_INDEX]], 16
; DEFAULT-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 16
; DEFAULT-NEXT: br i1 [[TMP3]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
; DEFAULT: memset.pattern-post-expansion:
; DEFAULT-NEXT: ret void
;
; UNROLL2-LABEL: @memset_pattern_i128_len16(
; UNROLL2-NEXT: br label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY:%.*]]
; UNROLL2: memset.pattern-expansion-main-body:
; UNROLL2-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP2:%.*]], [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]] ]
; UNROLL2-NEXT: [[TMP1:%.*]] = getelementptr inbounds <8 x i32>, ptr [[A:%.*]], i64 [[LOOP_INDEX]]
; UNROLL2-NEXT: store <8 x i32> bitcast (<2 x i128> splat (i128 -113427455635030943652277463699152839203) to <8 x i32>), ptr [[TMP1]], align 16
; UNROLL2-NEXT: [[TMP2]] = add i64 [[LOOP_INDEX]], 2
; UNROLL2-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 16
; UNROLL2-NEXT: br i1 [[TMP3]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
; UNROLL2: memset.pattern-post-expansion:
; UNROLL2-NEXT: ret void
;
call void @llvm.experimental.memset.pattern(ptr align 16 %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 16, i1 false)
ret void
}
; Runtime length with a constant i128 value: a single loop guarded by a
; "length != 0" check is expected, storing one i128 per iteration.
; NOTE(review): the parameter is declared align 16, but the intrinsic call
; below passes no alignment, which is why the expected store is align 1 -
; confirm that this is intended.
define void @memset_pattern_i128_dynlen(ptr align 16 %a, i64 %len) {
; CHECK-LABEL: @memset_pattern_i128_dynlen(
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[LEN:%.*]], 0
; CHECK-NEXT: br i1 [[TMP1]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY:%.*]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
; CHECK: memset.pattern-expansion-main-body:
; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i128, ptr [[A:%.*]], i64 [[LOOP_INDEX]]
; CHECK-NEXT: store i128 -113427455635030943652277463699152839203, ptr [[TMP2]], align 1
; CHECK-NEXT: [[TMP3]] = add i64 [[LOOP_INDEX]], 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], [[LEN]]
; CHECK-NEXT: br i1 [[TMP4]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION]]
; CHECK: memset.pattern-post-expansion:
; CHECK-NEXT: ret void
;
call void @llvm.experimental.memset.pattern(ptr %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 %len, i1 false)
ret void
}
; Runtime length with a destination in address space 3: a single guarded loop
; is expected, storing one i128 per iteration through the addrspace(3)
; pointer with align 16.
define void @memset_pattern_i128_dynlen_nz_as(ptr addrspace(3) align 16 %a, i64 %len) {
; CHECK-LABEL: @memset_pattern_i128_dynlen_nz_as(
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[LEN:%.*]], 0
; CHECK-NEXT: br i1 [[TMP1]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY:%.*]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
; CHECK: memset.pattern-expansion-main-body:
; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i128, ptr addrspace(3) [[A:%.*]], i64 [[LOOP_INDEX]]
; CHECK-NEXT: store i128 -113427455635030943652277463699152839203, ptr addrspace(3) [[TMP2]], align 16
; CHECK-NEXT: [[TMP3]] = add i64 [[LOOP_INDEX]], 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], [[LEN]]
; CHECK-NEXT: br i1 [[TMP4]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION]]
; CHECK: memset.pattern-post-expansion:
; CHECK-NEXT: ret void
;
call void @llvm.experimental.memset.pattern(ptr addrspace(3) align 16 %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 %len, i1 false)
ret void
}
; Runtime length with a constant i32 value: the length is split into
; len - (len & 3) values for the main loop, which stores a <4 x i32> splat per
; iteration, and len & 3 values for the residual loop, which stores single
; i32 values. Both loops are guarded by a "count != 0" check.
define void @memset_pattern_i32_dynlen(ptr align 16 %a, i64 %len) {
; CHECK-LABEL: @memset_pattern_i32_dynlen(
; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[LEN:%.*]], 3
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[LEN]], [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[TMP2]], 0
; CHECK-NEXT: br i1 [[TMP3]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY:%.*]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_COND:%.*]]
; CHECK: memset.pattern-expansion-main-body:
; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[A:%.*]], i64 [[LOOP_INDEX]]
; CHECK-NEXT: store <4 x i32> splat (i32 -1430532899), ptr [[TMP4]], align 16
; CHECK-NEXT: [[TMP5]] = add i64 [[LOOP_INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], [[TMP2]]
; CHECK-NEXT: br i1 [[TMP6]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_COND]]
; CHECK: memset.pattern-expansion-residual-cond:
; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[TMP7]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY:%.*]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
; CHECK: memset.pattern-expansion-residual-body:
; CHECK-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[MEMSET_PATTERN_EXPANSION_RESIDUAL_COND]] ], [ [[TMP10:%.*]], [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP2]], [[RESIDUAL_LOOP_INDEX]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
; CHECK-NEXT: store i32 -1430532899, ptr [[TMP9]], align 4
; CHECK-NEXT: [[TMP10]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; CHECK-NEXT: [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP1]]
; CHECK-NEXT: br i1 [[TMP11]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION]]
; CHECK: memset.pattern-post-expansion:
; CHECK-NEXT: ret void
;
call void @llvm.experimental.memset.pattern(ptr align 16 %a, i32 u0xaabbccdd, i64 %len, i1 false)
ret void
}
; Runtime length with a non-constant i32 value: same two-loop structure as
; the constant-value case, except that the <4 x i32> splat for the main loop
; is built with insertelement + shufflevector before the first branch, and
; the residual loop stores the scalar value.
define void @memset_pattern_i32_dynval_dynlen(ptr align 16 %a, i32 %val, i64 %len) {
; CHECK-LABEL: @memset_pattern_i32_dynval_dynlen(
; CHECK-NEXT: [[TMP1:%.*]] = and i64 [[LEN:%.*]], 3
; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[LEN]], [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[TMP2]], 0
; CHECK-NEXT: [[SETVALUE_SPLAT_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[VAL:%.*]], i64 0
; CHECK-NEXT: [[SETVALUE_SPLAT_SPLAT:%.*]] = shufflevector <4 x i32> [[SETVALUE_SPLAT_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br i1 [[TMP3]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY:%.*]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_COND:%.*]]
; CHECK: memset.pattern-expansion-main-body:
; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <4 x i32>, ptr [[A:%.*]], i64 [[LOOP_INDEX]]
; CHECK-NEXT: store <4 x i32> [[SETVALUE_SPLAT_SPLAT]], ptr [[TMP4]], align 16
; CHECK-NEXT: [[TMP5]] = add i64 [[LOOP_INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], [[TMP2]]
; CHECK-NEXT: br i1 [[TMP6]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_COND]]
; CHECK: memset.pattern-expansion-residual-cond:
; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[TMP7]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY:%.*]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
; CHECK: memset.pattern-expansion-residual-body:
; CHECK-NEXT: [[RESIDUAL_LOOP_INDEX:%.*]] = phi i64 [ 0, [[MEMSET_PATTERN_EXPANSION_RESIDUAL_COND]] ], [ [[TMP10:%.*]], [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]] ]
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP2]], [[RESIDUAL_LOOP_INDEX]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP8]]
; CHECK-NEXT: store i32 [[VAL]], ptr [[TMP9]], align 4
; CHECK-NEXT: [[TMP10]] = add i64 [[RESIDUAL_LOOP_INDEX]], 1
; CHECK-NEXT: [[TMP11:%.*]] = icmp ult i64 [[TMP10]], [[TMP1]]
; CHECK-NEXT: br i1 [[TMP11]], label [[MEMSET_PATTERN_EXPANSION_RESIDUAL_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION]]
; CHECK: memset.pattern-post-expansion:
; CHECK-NEXT: ret void
;
call void @llvm.experimental.memset.pattern(ptr align 16 %a, i32 %val, i64 %len, i1 false)
ret void
}
; For i96, the store size and the alloc size differ on AMDGPU, so this case is
; not optimized: a single guarded loop is expected that stores one i96 value
; per iteration, with no widened main loop and no residual loop.
define void @memset_pattern_i96_dynval_dynlen(ptr align 16 %a, i96 %val, i64 %len) {
; CHECK-LABEL: @memset_pattern_i96_dynval_dynlen(
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[LEN:%.*]], 0
; CHECK-NEXT: br i1 [[TMP1]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY:%.*]], label [[MEMSET_PATTERN_POST_EXPANSION:%.*]]
; CHECK: memset.pattern-expansion-main-body:
; CHECK-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i96, ptr [[A:%.*]], i64 [[LOOP_INDEX]]
; CHECK-NEXT: store i96 [[VAL:%.*]], ptr [[TMP2]], align 16
; CHECK-NEXT: [[TMP3]] = add i64 [[LOOP_INDEX]], 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], [[LEN]]
; CHECK-NEXT: br i1 [[TMP4]], label [[MEMSET_PATTERN_EXPANSION_MAIN_BODY]], label [[MEMSET_PATTERN_POST_EXPANSION]]
; CHECK: memset.pattern-post-expansion:
; CHECK-NEXT: ret void
;
call void @llvm.experimental.memset.pattern(ptr align 16 %a, i96 %val, i64 %len, i1 false)
ret void
}

View File

@ -7,8 +7,8 @@
define void @memset.pattern(ptr %a, i128 %value, i64 %x) nounwind {
; CHECK-LABEL: define void @memset.pattern(
; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 0, [[X]]
; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[X]], 0
; CHECK-NEXT: br i1 [[TMP1]], label %[[LOADSTORELOOP:.*]], label %[[SPLIT:.*]]
; CHECK: [[LOADSTORELOOP]]:
; CHECK-NEXT: [[TMP3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP3]]

View File

@ -4,14 +4,14 @@
define void @memset_pattern_i128_1(ptr %a, i128 %value) nounwind {
; CHECK-LABEL: define void @memset_pattern_i128_1(
; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
; CHECK-NEXT: br label %[[LOADSTORELOOP:.*]]
; CHECK: [[LOADSTORELOOP]]:
; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP2]]
; CHECK-NEXT: store i128 [[VALUE]], ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP3]] = add i64 [[TMP2]], 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1
; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT:.*]]
; CHECK: [[SPLIT]]:
; CHECK-NEXT: ret void
;
@ -22,14 +22,14 @@ define void @memset_pattern_i128_1(ptr %a, i128 %value) nounwind {
define void @memset_pattern_i128_16(ptr %a, i128 %value) nounwind {
; CHECK-LABEL: define void @memset_pattern_i128_16(
; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
; CHECK-NEXT: br label %[[LOADSTORELOOP:.*]]
; CHECK: [[LOADSTORELOOP]]:
; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP2]]
; CHECK-NEXT: store i128 [[VALUE]], ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP3]] = add i64 [[TMP2]], 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 16
; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT:.*]]
; CHECK: [[SPLIT]]:
; CHECK-NEXT: ret void
;
@ -40,8 +40,8 @@ define void @memset_pattern_i128_16(ptr %a, i128 %value) nounwind {
define void @memset_pattern_i127_x(ptr %a, i127 %value, i64 %x) nounwind {
; CHECK-LABEL: define void @memset_pattern_i127_x(
; CHECK-SAME: ptr [[A:%.*]], i127 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 0, [[X]]
; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[X]], 0
; CHECK-NEXT: br i1 [[TMP1]], label %[[LOADSTORELOOP:.*]], label %[[SPLIT:.*]]
; CHECK: [[LOADSTORELOOP]]:
; CHECK-NEXT: [[TMP3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i127, ptr [[A]], i64 [[TMP3]]
@ -59,8 +59,8 @@ define void @memset_pattern_i127_x(ptr %a, i127 %value, i64 %x) nounwind {
define void @memset_pattern_i128_x(ptr %a, i128 %value, i64 %x) nounwind {
; CHECK-LABEL: define void @memset_pattern_i128_x(
; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 0, [[X]]
; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[X]], 0
; CHECK-NEXT: br i1 [[TMP1]], label %[[LOADSTORELOOP:.*]], label %[[SPLIT:.*]]
; CHECK: [[LOADSTORELOOP]]:
; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[LOADSTORELOOP]] ]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP2]]
@ -78,8 +78,8 @@ define void @memset_pattern_i128_x(ptr %a, i128 %value, i64 %x) nounwind {
define void @memset_pattern_i256_x(ptr %a, i256 %value, i64 %x) nounwind {
; CHECK-LABEL: define void @memset_pattern_i256_x(
; CHECK-SAME: ptr [[A:%.*]], i256 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 0, [[X]]
; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[X]], 0
; CHECK-NEXT: br i1 [[TMP1]], label %[[LOADSTORELOOP:.*]], label %[[SPLIT:.*]]
; CHECK: [[LOADSTORELOOP]]:
; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[LOADSTORELOOP]] ]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i256, ptr [[A]], i64 [[TMP2]]
@ -99,8 +99,8 @@ define void @memset_pattern_i256_x(ptr %a, i256 %value, i64 %x) nounwind {
define void @memset_pattern_i15_x_alignment(ptr %a, i15 %value, i64 %x) nounwind {
; CHECK-LABEL: define void @memset_pattern_i15_x_alignment(
; CHECK-SAME: ptr [[A:%.*]], i15 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 0, [[X]]
; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[X]], 0
; CHECK-NEXT: br i1 [[TMP1]], label %[[LOADSTORELOOP:.*]], label %[[SPLIT:.*]]
; CHECK: [[LOADSTORELOOP]]:
; CHECK-NEXT: [[TMP3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i15, ptr [[A]], i64 [[TMP3]]
@ -109,8 +109,8 @@ define void @memset_pattern_i15_x_alignment(ptr %a, i15 %value, i64 %x) nounwind
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], [[X]]
; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
; CHECK: [[SPLIT]]:
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 0, [[X]]
; CHECK-NEXT: br i1 [[TMP7]], label %[[SPLIT1:.*]], label %[[LOADSTORELOOP2:.*]]
; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[X]], 0
; CHECK-NEXT: br i1 [[TMP6]], label %[[LOADSTORELOOP2:.*]], label %[[SPLIT1:.*]]
; CHECK: [[LOADSTORELOOP2]]:
; CHECK-NEXT: [[TMP11:%.*]] = phi i64 [ 0, %[[SPLIT]] ], [ [[TMP9:%.*]], %[[LOADSTORELOOP2]] ]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i15, ptr [[A]], i64 [[TMP11]]

View File

@ -12,14 +12,14 @@
define void @memset_pattern_i128_1_dynvalue(ptr %a, i128 %value) nounwind {
; CHECK-LABEL: define void @memset_pattern_i128_1_dynvalue(
; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
; CHECK-NEXT: br label %[[LOADSTORELOOP:.*]]
; CHECK: [[LOADSTORELOOP]]:
; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP1]]
; CHECK-NEXT: store i128 [[VALUE]], ptr [[TMP2]], align 1
; CHECK-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1
; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT:.*]]
; CHECK: [[SPLIT]]:
; CHECK-NEXT: ret void
;
@ -40,14 +40,14 @@ define void @memset_pattern_i128_1(ptr %a, i128 %value) nounwind {
define void @memset_pattern_i128_1_nz_as(ptr addrspace(1) %a, i128 %value) nounwind {
; CHECK-LABEL: define void @memset_pattern_i128_1_nz_as(
; CHECK-SAME: ptr addrspace(1) [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
; CHECK-NEXT: br label %[[LOADSTORELOOP:.*]]
; CHECK: [[LOADSTORELOOP]]:
; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i128, ptr addrspace(1) [[A]], i64 [[TMP1]]
; CHECK-NEXT: store i128 -113427455635030943652277463699152839203, ptr addrspace(1) [[TMP2]], align 1
; CHECK-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1
; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT:.*]]
; CHECK: [[SPLIT]]:
; CHECK-NEXT: ret void
;
@ -89,8 +89,8 @@ define void @memset_pattern_i128_x(ptr %a, i64 %x) nounwind {
define void @memset_pattern_i128_x_nonzero_as(ptr addrspace(10) %a, i64 %x) nounwind {
; CHECK-LABEL: define void @memset_pattern_i128_x_nonzero_as(
; CHECK-SAME: ptr addrspace(10) [[A:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 0, [[X]]
; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[X]], 0
; CHECK-NEXT: br i1 [[TMP1]], label %[[LOADSTORELOOP:.*]], label %[[SPLIT:.*]]
; CHECK: [[LOADSTORELOOP]]:
; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i128, ptr addrspace(10) [[A]], i64 [[TMP2]]
@ -195,8 +195,8 @@ define void @memset_pattern_i64_x_fromptr(ptr %a, i64 %x) nounwind {
define void @memset_pattern_i64_x_fromnonconstptr(ptr %a, i64 %x, ptr %p) nounwind {
; CHECK-LABEL: define void @memset_pattern_i64_x_fromnonconstptr(
; CHECK-SAME: ptr [[A:%.*]], i64 [[X:%.*]], ptr [[P:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 0, [[X]]
; CHECK-NEXT: br i1 [[TMP2]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[X]], 0
; CHECK-NEXT: br i1 [[TMP1]], label %[[LOADSTORELOOP:.*]], label %[[SPLIT:.*]]
; CHECK: [[LOADSTORELOOP]]:
; CHECK-NEXT: [[TMP3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[LOADSTORELOOP]] ]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[TMP3]]