diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index 9ad9e59d9cea..02c17575469a 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -2294,6 +2294,13 @@ public: /// not specified. LLVM_ABI Value *CreateAggregateCast(Value *V, Type *DestTy); + /// Create a chain of casts to convert V to NewTy, preserving the bit pattern + /// of V. This may involve multiple casts (e.g., ptr -> i64 -> <2 x i32>). + /// The created cast instructions are inserted into the current basic block. + /// If no casts are needed, V is returned. + LLVM_ABI Value *CreateBitPreservingCastChain(const DataLayout &DL, Value *V, + Type *NewTy); + //===--------------------------------------------------------------------===// // Instruction creation methods: Compare Instructions //===--------------------------------------------------------------------===// diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index c87fd1b444c0..fb282441b908 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -103,6 +103,79 @@ Value *IRBuilderBase::CreateAggregateCast(Value *V, Type *DestTy) { return CreateBitOrPointerCast(V, DestTy); } +Value *IRBuilderBase::CreateBitPreservingCastChain(const DataLayout &DL, + Value *V, Type *NewTy) { + Type *OldTy = V->getType(); + + if (OldTy == NewTy) + return V; + + assert(!(isa(OldTy) && isa(NewTy)) && + "Integer types must be the exact same to convert."); + + // A variant of bitcast that supports a mixture of fixed and scalable types + // that are known to have the same size.
+ auto CreateBitCastLike = [this](Value *In, Type *Ty) -> Value * { + Type *InTy = In->getType(); + if (InTy == Ty) + return In; + + if (isa(InTy) && isa(Ty)) { + // For vscale_range(2) expand <4 x i32> to --> + // <4 x i32> to to + auto *VTy = VectorType::getWithSizeAndScalar(cast(Ty), InTy); + return CreateBitCast( + CreateInsertVector(VTy, PoisonValue::get(VTy), In, getInt64(0)), Ty); + } + + if (isa(InTy) && isa(Ty)) { + // For vscale_range(2) expand to <4 x i32> --> + // to to <4 x i32> + auto *VTy = VectorType::getWithSizeAndScalar(cast(InTy), Ty); + return CreateExtractVector(Ty, CreateBitCast(In, VTy), getInt64(0)); + } + + return CreateBitCast(In, Ty); + }; + + // See if we need inttoptr for this type pair. May require additional bitcast. + if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) { + // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8* + // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*> + // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*> + // Directly handle i64 to i8* + return CreateIntToPtr(CreateBitCastLike(V, DL.getIntPtrType(NewTy)), NewTy); + } + + // See if we need ptrtoint for this type pair. May require additional bitcast. + if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) { + // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128 + // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32> + // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32> + // Expand i8* to i64 --> i8* to i64 to i64 + return CreateBitCastLike(CreatePtrToInt(V, DL.getIntPtrType(OldTy)), NewTy); + } + + if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) { + unsigned OldAS = OldTy->getPointerAddressSpace(); + unsigned NewAS = NewTy->getPointerAddressSpace(); + // To convert pointers with different address spaces (they are already + // checked convertible, i.e. 
they have the same pointer size), so far we + // cannot use `bitcast` (which is restricted to the same address space) or + // `addrspacecast` (which is not always a no-op cast). Instead, use a pair + // of no-op `ptrtoint`/`inttoptr` casts through an integer with the same bit + // size. + if (OldAS != NewAS) { + return CreateIntToPtr( + CreateBitCastLike(CreatePtrToInt(V, DL.getIntPtrType(OldTy)), + DL.getIntPtrType(NewTy)), + NewTy); + } + } + + return CreateBitCastLike(V, NewTy); +} + CallInst * IRBuilderBase::createCallHelper(Function *Callee, ArrayRef Ops, const Twine &Name, FMFSource FMFSource, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index d18d3a13b29e..ed676c3fde2f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -606,21 +606,6 @@ static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL, InstSimplifyFolder(DL)); Builder.SetInsertPoint(Inst); - const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val, - Type *PtrTy) -> Value * { - assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy)); - const unsigned Size = DL.getTypeStoreSizeInBits(PtrTy); - if (!PtrTy->isVectorTy()) - return Builder.CreateBitOrPointerCast(Val, Builder.getIntNTy(Size)); - const unsigned NumPtrElts = cast(PtrTy)->getNumElements(); - // If we want to cast to cast, e.g. a <2 x ptr> into a <4 x i32>, we need to - // first cast the ptr vector to <2 x i64>.
- assert((Size % NumPtrElts == 0) && "Vector size not divisble"); - Type *EltTy = Builder.getIntNTy(Size / NumPtrElts); - return Builder.CreateBitOrPointerCast( - Val, FixedVectorType::get(EltTy, NumPtrElts)); - }; - Type *VecEltTy = AA.Vector.Ty->getElementType(); switch (Inst->getOpcode()) { @@ -634,12 +619,8 @@ static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL, TypeSize AccessSize = DL.getTypeStoreSize(AccessTy); if (Constant *CI = dyn_cast(Index)) { if (CI->isZeroValue() && AccessSize == VecStoreSize) { - if (AccessTy->isPtrOrPtrVectorTy()) - CurVal = CreateTempPtrIntCast(CurVal, AccessTy); - else if (CurVal->getType()->isPtrOrPtrVectorTy()) - CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType()); - Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, AccessTy); - Inst->replaceAllUsesWith(NewVal); + Inst->replaceAllUsesWith( + Builder.CreateBitPreservingCastChain(DL, CurVal, AccessTy)); return nullptr; } } @@ -689,13 +670,8 @@ static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL, SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K); } - if (AccessTy->isPtrOrPtrVectorTy()) - SubVec = CreateTempPtrIntCast(SubVec, AccessTy); - else if (SubVecTy->isPtrOrPtrVectorTy()) - SubVec = CreateTempPtrIntCast(SubVec, SubVecTy); - - SubVec = Builder.CreateBitOrPointerCast(SubVec, AccessTy); - Inst->replaceAllUsesWith(SubVec); + Inst->replaceAllUsesWith( + Builder.CreateBitPreservingCastChain(DL, SubVec, AccessTy)); return nullptr; } @@ -719,15 +695,9 @@ static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL, // We're storing the full vector, we can handle this without knowing CurVal. 
Type *AccessTy = Val->getType(); TypeSize AccessSize = DL.getTypeStoreSize(AccessTy); - if (Constant *CI = dyn_cast(Index)) { - if (CI->isZeroValue() && AccessSize == VecStoreSize) { - if (AccessTy->isPtrOrPtrVectorTy()) - Val = CreateTempPtrIntCast(Val, AccessTy); - else if (AA.Vector.Ty->isPtrOrPtrVectorTy()) - Val = CreateTempPtrIntCast(Val, AA.Vector.Ty); - return Builder.CreateBitOrPointerCast(Val, AA.Vector.Ty); - } - } + if (Constant *CI = dyn_cast(Index)) + if (CI->isZeroValue() && AccessSize == VecStoreSize) + return Builder.CreateBitPreservingCastChain(DL, Val, AA.Vector.Ty); // Storing a subvector. if (isa(AccessTy)) { @@ -738,13 +708,7 @@ static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL, auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts); assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy)); - if (SubVecTy->isPtrOrPtrVectorTy()) - Val = CreateTempPtrIntCast(Val, SubVecTy); - else if (AccessTy->isPtrOrPtrVectorTy()) - Val = CreateTempPtrIntCast(Val, AccessTy); - - Val = Builder.CreateBitOrPointerCast(Val, SubVecTy); - + Val = Builder.CreateBitPreservingCastChain(DL, Val, SubVecTy); Value *CurVec = GetCurVal(); for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts); K < NumElts; ++K) { diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 83eabdae3db7..97a8a0a52cff 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -2051,99 +2051,6 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy, return true; } -/// Generic routine to convert an SSA value to a value of a different -/// type. -/// -/// This will try various different casting techniques, such as bitcasts, -/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test -/// two types for viability with this routine. 
-static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, - Type *NewTy) { - Type *OldTy = V->getType(); - -#ifndef NDEBUG - BasicBlock *BB = IRB.GetInsertBlock(); - assert(BB && BB->getParent() && "VScale unknown!"); - unsigned VScale = BB->getParent()->getVScaleValue(); - assert(canConvertValue(DL, OldTy, NewTy, VScale) && - "Value not convertable to type"); -#endif - - if (OldTy == NewTy) - return V; - - assert(!(isa(OldTy) && isa(NewTy)) && - "Integer types must be the exact same to convert."); - - // A variant of bitcast that supports a mixture of fixed and scalable types - // that are know to have the same size. - auto CreateBitCastLike = [&IRB](Value *In, Type *Ty) -> Value * { - Type *InTy = In->getType(); - if (InTy == Ty) - return In; - - if (isa(InTy) && isa(Ty)) { - // For vscale_range(2) expand <4 x i32> to --> - // <4 x i32> to to - auto *VTy = VectorType::getWithSizeAndScalar(cast(Ty), InTy); - return IRB.CreateBitCast(IRB.CreateInsertVector(VTy, - PoisonValue::get(VTy), In, - IRB.getInt64(0)), - Ty); - } - - if (isa(InTy) && isa(Ty)) { - // For vscale_range(2) expand to <4 x i32> --> - // to to <4 x i32> - auto *VTy = VectorType::getWithSizeAndScalar(cast(InTy), Ty); - return IRB.CreateExtractVector(Ty, IRB.CreateBitCast(In, VTy), - IRB.getInt64(0)); - } - - return IRB.CreateBitCast(In, Ty); - }; - - // See if we need inttoptr for this type pair. May require additional bitcast. - if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) { - // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8* - // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*> - // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*> - // Directly handle i64 to i8* - return IRB.CreateIntToPtr(CreateBitCastLike(V, DL.getIntPtrType(NewTy)), - NewTy); - } - - // See if we need ptrtoint for this type pair. May require additional bitcast. 
- if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) { - // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128 - // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32> - // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32> - // Expand i8* to i64 --> i8* to i64 to i64 - return CreateBitCastLike(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), - NewTy); - } - - if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) { - unsigned OldAS = OldTy->getPointerAddressSpace(); - unsigned NewAS = NewTy->getPointerAddressSpace(); - // To convert pointers with different address spaces (they are already - // checked convertible, i.e. they have the same pointer size), so far we - // cannot use `bitcast` (which has restrict on the same address space) or - // `addrspacecast` (which is not always no-op casting). Instead, use a pair - // of no-op `ptrtoint`/`inttoptr` casts through an integer with the same bit - // size. - if (OldAS != NewAS) { - assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS)); - return IRB.CreateIntToPtr( - CreateBitCastLike(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), - DL.getIntPtrType(NewTy)), - NewTy); - } - } - - return CreateBitCastLike(V, NewTy); -} - /// Test whether the given slice use can be promoted to a vector. 
/// /// This function is called to test each entry in a partition which is slated @@ -3243,7 +3150,7 @@ private: assert(!LI.isVolatile()); Value *V = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load"); - V = convertValue(DL, IRB, V, IntTy); + V = IRB.CreateBitPreservingCastChain(DL, V, IntTy); assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) { @@ -3336,7 +3243,7 @@ private: V = NewLI; IsPtrAdjusted = true; } - V = convertValue(DL, IRB, V, TargetTy); + V = IRB.CreateBitPreservingCastChain(DL, V, TargetTy); if (IsSplit) { assert(!LI.isVolatile()); @@ -3391,7 +3298,7 @@ private: ? ElementTy : FixedVectorType::get(ElementTy, NumElements); if (V->getType() != SliceTy) - V = convertValue(DL, IRB, V, SliceTy); + V = IRB.CreateBitPreservingCastChain(DL, V, SliceTy); // Mix in the existing elements. Value *Old = @@ -3420,12 +3327,12 @@ private: IntTy->getBitWidth()) { Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "oldload"); - Old = convertValue(DL, IRB, Old, IntTy); + Old = IRB.CreateBitPreservingCastChain(DL, Old, IntTy); assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t Offset = BeginOffset - NewAllocaBeginOffset; V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert"); } - V = convertValue(DL, IRB, V, NewAllocaTy); + V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy); StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign()); Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access, LLVMContext::MD_access_group}); @@ -3477,7 +3384,7 @@ private: if (NewBeginOffset == NewAllocaBeginOffset && NewEndOffset == NewAllocaEndOffset && canConvertValue(DL, V->getType(), NewAllocaTy)) { - V = convertValue(DL, IRB, V, NewAllocaTy); + V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy); Value *NewPtr = 
getPtrToNewAI(SI.getPointerAddressSpace(), SI.isVolatile()); @@ -3628,7 +3535,7 @@ private: Value *Splat = getIntegerSplat( II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8); - Splat = convertValue(DL, IRB, Splat, ElementTy); + Splat = IRB.CreateBitPreservingCastChain(DL, Splat, ElementTy); if (NumElements > 1) Splat = getVectorSplat(Splat, NumElements); @@ -3647,14 +3554,14 @@ private: EndOffset != NewAllocaBeginOffset)) { Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "oldload"); - Old = convertValue(DL, IRB, Old, IntTy); + Old = IRB.CreateBitPreservingCastChain(DL, Old, IntTy); uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; V = insertInteger(DL, IRB, Old, V, Offset, "insert"); } else { assert(V->getType() == IntTy && "Wrong type for an alloca wide integer!"); } - V = convertValue(DL, IRB, V, NewAllocaTy); + V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy); } else { // Established these invariants above. assert(NewBeginOffset == NewAllocaBeginOffset); @@ -3666,7 +3573,7 @@ private: V = getVectorSplat( V, cast(AllocaVecTy)->getNumElements()); - V = convertValue(DL, IRB, V, NewAllocaTy); + V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy); } Value *NewPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile()); @@ -3868,7 +3775,7 @@ private: } else if (IntTy && !IsWholeAlloca && !IsDest) { Src = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load"); - Src = convertValue(DL, IRB, Src, IntTy); + Src = IRB.CreateBitPreservingCastChain(DL, Src, IntTy); uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset; Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract"); } else { @@ -3889,10 +3796,10 @@ private: } else if (IntTy && !IsWholeAlloca && IsDest) { Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "oldload"); - Old = convertValue(DL, IRB, Old, IntTy); + Old = IRB.CreateBitPreservingCastChain(DL, Old, IntTy); uint64_t Offset = NewBeginOffset - 
NewAllocaBeginOffset; Src = insertInteger(DL, IRB, Old, Src, Offset, "insert"); - Src = convertValue(DL, IRB, Src, NewAllocaTy); + Src = IRB.CreateBitPreservingCastChain(DL, Src, NewAllocaTy); } StoreInst *Store = cast( diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll index 015ce256a80c..598a954da056 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll @@ -9,9 +9,9 @@ define amdgpu_kernel void @test_overwrite(i64 %val, i1 %cond) { ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <3 x i64> [[STACK]], i64 43, i32 0 ; CHECK-NEXT: br i1 [[COND]], label [[LOOP:%.*]], label [[END:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP3:%.*]], [[LOOP]] ], [ [[TMP0]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x i64> [[PROMOTEALLOCA1]], i64 68, i32 0 +; CHECK-NEXT: [[PROMOTEALLOCA2:%.*]] = phi <3 x i64> [ [[TMP3:%.*]], [[LOOP]] ], [ [[TMP0]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA2]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x i64> [[PROMOTEALLOCA2]], i64 68, i32 0 ; CHECK-NEXT: [[TMP3]] = insertelement <3 x i64> [[TMP2]], i64 32, i32 0 ; CHECK-NEXT: [[LOOP_CC:%.*]] = icmp ne i64 [[TMP1]], 68 ; CHECK-NEXT: br i1 [[LOOP_CC]], label [[LOOP]], label [[END]] @@ -67,9 +67,9 @@ define amdgpu_kernel void @test_no_overwrite(i64 %val, i1 %cond) { ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <3 x i64> [[STACK]], i64 43, i32 0 ; CHECK-NEXT: br i1 [[COND]], label [[LOOP:%.*]], label [[END:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP2:%.*]], [[LOOP]] ], [ [[TMP0]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0 -; CHECK-NEXT: [[TMP2]] = insertelement <3 x i64> 
[[PROMOTEALLOCA1]], i64 32, i32 1 +; CHECK-NEXT: [[PROMOTEALLOCA2:%.*]] = phi <3 x i64> [ [[TMP2:%.*]], [[LOOP]] ], [ [[TMP0]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA2]], i32 0 +; CHECK-NEXT: [[TMP2]] = insertelement <3 x i64> [[PROMOTEALLOCA2]], i64 32, i32 1 ; CHECK-NEXT: [[LOOP_CC:%.*]] = icmp ne i64 [[TMP1]], 32 ; CHECK-NEXT: br i1 [[LOOP_CC]], label [[LOOP]], label [[END]] ; CHECK: end: @@ -192,6 +192,57 @@ entry: ret void } +define void @alloca_load_store_ptr_ptrvec(ptr %arg) { +; CHECK-LABEL: define void @alloca_load_store_ptr_ptrvec +; CHECK-SAME: (ptr [[ARG:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <2 x ptr addrspace(3)> poison +; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[ARG]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = inttoptr <2 x i32> [[TMP1]] to <2 x ptr addrspace(3)> +; CHECK-NEXT: ret void +; +entry: + %alloca = alloca <2 x ptr addrspace(3)>, align 8, addrspace(5) + store ptr %arg, ptr addrspace(5) %alloca, align 8 + %tmp = load ptr, ptr addrspace(5) %alloca, align 8 + ret void +} + +define <2 x ptr> @alloca_load_store_diff_size_ptrvecs1(<2 x ptr> %arg) { +; CHECK-LABEL: define <2 x ptr> @alloca_load_store_diff_size_ptrvecs1 +; CHECK-SAME: (<2 x ptr> [[ARG:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <4 x ptr addrspace(3)> poison +; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint <2 x ptr> [[ARG]] to <2 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = inttoptr <4 x i32> [[TMP1]] to <4 x ptr addrspace(3)> +; CHECK-NEXT: ret <2 x ptr> [[ARG]] +; +entry: + %alloca = alloca <4 x ptr addrspace(3)>, align 8, addrspace(5) + store <2 x ptr> %arg, ptr addrspace(5) %alloca, align 8 + %tmp = load <2 x ptr>, ptr addrspace(5) %alloca, align 8 + ret <2 x ptr> %tmp +} + +define <4 x ptr addrspace(3)> @alloca_load_store_diff_size_ptrvecs2(<4 x ptr addrspace(3)> 
%arg) { +; CHECK-LABEL: define <4 x ptr addrspace(3)> @alloca_load_store_diff_size_ptrvecs2 +; CHECK-SAME: (<4 x ptr addrspace(3)> [[ARG:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ALLOCA:%.*]] = freeze <2 x ptr> poison +; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint <4 x ptr addrspace(3)> [[ARG]] to <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = inttoptr <2 x i64> [[TMP1]] to <2 x ptr> +; CHECK-NEXT: ret <4 x ptr addrspace(3)> [[ARG]] +; +entry: + %alloca = alloca <2 x ptr>, align 8, addrspace(5) + store <4 x ptr addrspace(3)> %arg, ptr addrspace(5) %alloca, align 8 + %tmp = load <4 x ptr addrspace(3)>, ptr addrspace(5) %alloca, align 8 + ret <4 x ptr addrspace(3)> %tmp +} + ; Will not vectorize because we're accessing a 64 bit vector with a 32 bits pointer. define ptr addrspace(3) @alloca_load_store_ptr_mixed_full_ivec(ptr addrspace(3) %arg) { ; CHECK-LABEL: define ptr addrspace(3) @alloca_load_store_ptr_mixed_full_ivec