diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 9ad9e59d9cea..02c17575469a 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -2294,6 +2294,13 @@ public:
   /// not specified.
   LLVM_ABI Value *CreateAggregateCast(Value *V, Type *DestTy);
 
+  /// Create a chain of casts to convert V to NewTy, preserving the bit pattern
+  /// of V. This may involve multiple casts (e.g., ptr -> i64 -> <2 x i32>),
+  /// with DL supplying the pointer-sized integer types. The casts are inserted
+  /// into the current basic block. If no casts are needed, V is returned.
+  LLVM_ABI Value *CreateBitPreservingCastChain(const DataLayout &DL, Value *V,
+                                               Type *NewTy);
+
   //===--------------------------------------------------------------------===//
   // Instruction creation methods: Compare Instructions
   //===--------------------------------------------------------------------===//
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index c87fd1b444c0..fb282441b908 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -103,6 +103,79 @@ Value *IRBuilderBase::CreateAggregateCast(Value *V, Type *DestTy) {
   return CreateBitOrPointerCast(V, DestTy);
 }
 
+Value *IRBuilderBase::CreateBitPreservingCastChain(const DataLayout &DL,
+                                                   Value *V, Type *NewTy) {
+  Type *OldTy = V->getType();
+
+  if (OldTy == NewTy)
+    return V;
+
+  assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
+         "Integer types must be the exact same to convert.");
+
+  // A variant of bitcast that supports a mixture of fixed and scalable types
+  // that are known to have the same size.
+  auto CreateBitCastLike = [this](Value *In, Type *Ty) -> Value * {
+    Type *InTy = In->getType();
+    if (InTy == Ty)
+      return In;
+
+    if (isa<FixedVectorType>(InTy) && isa<ScalableVectorType>(Ty)) {
+      // For vscale_range(2) expand <4 x i32> to <vscale x 4 x i16> -->
+      //   <4 x i32> to <vscale x 2 x i32> to <vscale x 4 x i16>
+      auto *VTy = VectorType::getWithSizeAndScalar(cast<VectorType>(Ty), InTy);
+      return CreateBitCast(
+          CreateInsertVector(VTy, PoisonValue::get(VTy), In, getInt64(0)), Ty);
+    }
+
+    if (isa<ScalableVectorType>(InTy) && isa<FixedVectorType>(Ty)) {
+      // For vscale_range(2) expand <vscale x 4 x i16> to <4 x i32> -->
+      //   <vscale x 4 x i16> to <vscale x 2 x i32> to <4 x i32>
+      auto *VTy = VectorType::getWithSizeAndScalar(cast<VectorType>(InTy), Ty);
+      return CreateExtractVector(Ty, CreateBitCast(In, VTy), getInt64(0));
+    }
+
+    return CreateBitCast(In, Ty);
+  };
+
+  // See if we need inttoptr for this type pair. May require additional bitcast.
+  if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
+    // Expand <2 x i32> to ptr --> <2 x i32> to i64 to ptr
+    // Expand i128 to <2 x ptr> --> i128 to <2 x i64> to <2 x ptr>
+    // Expand <4 x i32> to <2 x ptr> --> <4 x i32> to <2 x i64> to <2 x ptr>
+    // Directly handle i64 to ptr
+    return CreateIntToPtr(CreateBitCastLike(V, DL.getIntPtrType(NewTy)), NewTy);
+  }
+
+  // See if we need ptrtoint for this type pair. May require additional bitcast.
+  if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
+    // Expand <2 x ptr> to i128 --> <2 x ptr> to <2 x i64> to i128
+    // Expand ptr to <2 x i32> --> ptr to i64 to <2 x i32>
+    // Expand <2 x ptr> to <4 x i32> --> <2 x ptr> to <2 x i64> to <4 x i32>
+    // Directly handle ptr to i64
+    return CreateBitCastLike(CreatePtrToInt(V, DL.getIntPtrType(OldTy)), NewTy);
+  }
+
+  if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
+    unsigned OldAS = OldTy->getPointerAddressSpace();
+    unsigned NewAS = NewTy->getPointerAddressSpace();
+    // Pointers in different address spaces cannot be converted with `bitcast`
+    // (which requires matching address spaces) or `addrspacecast` (which is
+    // not always a no-op cast). Instead, round-trip through integers: a
+    // `ptrtoint`, a bitcast between the two pointer-sized integer types when
+    // they differ, and an `inttoptr`. The total sizes of the two types are
+    // assumed to match; this is not checked here.
+    if (OldAS != NewAS) {
+      return CreateIntToPtr(
+          CreateBitCastLike(CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+                            DL.getIntPtrType(NewTy)),
+          NewTy);
+    }
+  }
+
+  return CreateBitCastLike(V, NewTy);
+}
+
 CallInst *
 IRBuilderBase::createCallHelper(Function *Callee, ArrayRef<Value *> Ops,
                                 const Twine &Name, FMFSource FMFSource,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index d18d3a13b29e..ed676c3fde2f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -606,21 +606,6 @@ static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL,
                                         InstSimplifyFolder(DL));
   Builder.SetInsertPoint(Inst);
 
-  const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
-                                                   Type *PtrTy) -> Value * {
-    assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
-    const unsigned Size = DL.getTypeStoreSizeInBits(PtrTy);
-    if (!PtrTy->isVectorTy())
-      return Builder.CreateBitOrPointerCast(Val, Builder.getIntNTy(Size));
-    const unsigned NumPtrElts = cast<FixedVectorType>(PtrTy)->getNumElements();
-    // If we want to cast to cast, e.g. a <2 x ptr> into a <4 x i32>, we need to
-    // first cast the ptr vector to <2 x i64>.
-    assert((Size % NumPtrElts == 0) && "Vector size not divisble");
-    Type *EltTy = Builder.getIntNTy(Size / NumPtrElts);
-    return Builder.CreateBitOrPointerCast(
-        Val, FixedVectorType::get(EltTy, NumPtrElts));
-  };
-
   Type *VecEltTy = AA.Vector.Ty->getElementType();
 
   switch (Inst->getOpcode()) {
@@ -634,12 +619,8 @@ static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL,
     TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
     if (Constant *CI = dyn_cast<Constant>(Index)) {
       if (CI->isZeroValue() && AccessSize == VecStoreSize) {
-        if (AccessTy->isPtrOrPtrVectorTy())
-          CurVal = CreateTempPtrIntCast(CurVal, AccessTy);
-        else if (CurVal->getType()->isPtrOrPtrVectorTy())
-          CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType());
-        Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, AccessTy);
-        Inst->replaceAllUsesWith(NewVal);
+        Inst->replaceAllUsesWith(
+            Builder.CreateBitPreservingCastChain(DL, CurVal, AccessTy));
         return nullptr;
       }
     }
@@ -689,13 +670,8 @@ static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL,
             SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K);
       }
 
-      if (AccessTy->isPtrOrPtrVectorTy())
-        SubVec = CreateTempPtrIntCast(SubVec, AccessTy);
-      else if (SubVecTy->isPtrOrPtrVectorTy())
-        SubVec = CreateTempPtrIntCast(SubVec, SubVecTy);
-
-      SubVec = Builder.CreateBitOrPointerCast(SubVec, AccessTy);
-      Inst->replaceAllUsesWith(SubVec);
+      Inst->replaceAllUsesWith(
+          Builder.CreateBitPreservingCastChain(DL, SubVec, AccessTy));
       return nullptr;
     }
 
@@ -719,15 +695,9 @@ static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL,
     // We're storing the full vector, we can handle this without knowing CurVal.
     Type *AccessTy = Val->getType();
     TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
-    if (Constant *CI = dyn_cast<Constant>(Index)) {
-      if (CI->isZeroValue() && AccessSize == VecStoreSize) {
-        if (AccessTy->isPtrOrPtrVectorTy())
-          Val = CreateTempPtrIntCast(Val, AccessTy);
-        else if (AA.Vector.Ty->isPtrOrPtrVectorTy())
-          Val = CreateTempPtrIntCast(Val, AA.Vector.Ty);
-        return Builder.CreateBitOrPointerCast(Val, AA.Vector.Ty);
-      }
-    }
+    if (Constant *CI = dyn_cast<Constant>(Index))
+      if (CI->isZeroValue() && AccessSize == VecStoreSize)
+        return Builder.CreateBitPreservingCastChain(DL, Val, AA.Vector.Ty);
 
     // Storing a subvector.
     if (isa<FixedVectorType>(AccessTy)) {
@@ -738,13 +708,7 @@ static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL,
       auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts);
       assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
 
-      if (SubVecTy->isPtrOrPtrVectorTy())
-        Val = CreateTempPtrIntCast(Val, SubVecTy);
-      else if (AccessTy->isPtrOrPtrVectorTy())
-        Val = CreateTempPtrIntCast(Val, AccessTy);
-
-      Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);
-
+      Val = Builder.CreateBitPreservingCastChain(DL, Val, SubVecTy);
       Value *CurVec = GetCurVal();
       for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
            K < NumElts; ++K) {
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 83eabdae3db7..97a8a0a52cff 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -2051,99 +2051,6 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy,
   return true;
 }
 
-/// Generic routine to convert an SSA value to a value of a different
-/// type.
-///
-/// This will try various different casting techniques, such as bitcasts,
-/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
-/// two types for viability with this routine.
-static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
-                           Type *NewTy) {
-  Type *OldTy = V->getType();
-
-#ifndef NDEBUG
-  BasicBlock *BB = IRB.GetInsertBlock();
-  assert(BB && BB->getParent() && "VScale unknown!");
-  unsigned VScale = BB->getParent()->getVScaleValue();
-  assert(canConvertValue(DL, OldTy, NewTy, VScale) &&
-         "Value not convertable to type");
-#endif
-
-  if (OldTy == NewTy)
-    return V;
-
-  assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
-         "Integer types must be the exact same to convert.");
-
-  // A variant of bitcast that supports a mixture of fixed and scalable types
-  // that are know to have the same size.
-  auto CreateBitCastLike = [&IRB](Value *In, Type *Ty) -> Value * {
-    Type *InTy = In->getType();
-    if (InTy == Ty)
-      return In;
-
-    if (isa<FixedVectorType>(InTy) && isa<ScalableVectorType>(Ty)) {
-      // For vscale_range(2) expand <4 x i32> to <vscale x 4 x i16> -->
-      //   <4 x i32> to <vscale x 2 x i32> to <vscale x 4 x i16>
-      auto *VTy = VectorType::getWithSizeAndScalar(cast<VectorType>(Ty), InTy);
-      return IRB.CreateBitCast(IRB.CreateInsertVector(VTy,
-                                                      PoisonValue::get(VTy), In,
-                                                      IRB.getInt64(0)),
-                               Ty);
-    }
-
-    if (isa<ScalableVectorType>(InTy) && isa<FixedVectorType>(Ty)) {
-      // For vscale_range(2) expand <vscale x 4 x i16> to <4 x i32> -->
-      //   <vscale x 4 x i16> to <vscale x 2 x i32> to <4 x i32>
-      auto *VTy = VectorType::getWithSizeAndScalar(cast<VectorType>(InTy), Ty);
-      return IRB.CreateExtractVector(Ty, IRB.CreateBitCast(In, VTy),
-                                     IRB.getInt64(0));
-    }
-
-    return IRB.CreateBitCast(In, Ty);
-  };
-
-  // See if we need inttoptr for this type pair. May require additional bitcast.
-  if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
-    // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
-    // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
-    // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*>
-    // Directly handle i64 to i8*
-    return IRB.CreateIntToPtr(CreateBitCastLike(V, DL.getIntPtrType(NewTy)),
-                              NewTy);
-  }
-
-  // See if we need ptrtoint for this type pair. May require additional bitcast.
-  if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
-    // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
-    // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
-    // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32>
-    // Expand i8* to i64 --> i8* to i64 to i64
-    return CreateBitCastLike(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
-                             NewTy);
-  }
-
-  if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
-    unsigned OldAS = OldTy->getPointerAddressSpace();
-    unsigned NewAS = NewTy->getPointerAddressSpace();
-    // To convert pointers with different address spaces (they are already
-    // checked convertible, i.e. they have the same pointer size), so far we
-    // cannot use `bitcast` (which has restrict on the same address space) or
-    // `addrspacecast` (which is not always no-op casting). Instead, use a pair
-    // of no-op `ptrtoint`/`inttoptr` casts through an integer with the same bit
-    // size.
-    if (OldAS != NewAS) {
-      assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
-      return IRB.CreateIntToPtr(
-          CreateBitCastLike(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
-                            DL.getIntPtrType(NewTy)),
-          NewTy);
-    }
-  }
-
-  return CreateBitCastLike(V, NewTy);
-}
-
 /// Test whether the given slice use can be promoted to a vector.
 ///
 /// This function is called to test each entry in a partition which is slated
@@ -3243,7 +3150,7 @@ private:
     assert(!LI.isVolatile());
     Value *V =
         IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
-    V = convertValue(DL, IRB, V, IntTy);
+    V = IRB.CreateBitPreservingCastChain(DL, V, IntTy);
     assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
     uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
     if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
@@ -3336,7 +3243,7 @@ private:
       V = NewLI;
       IsPtrAdjusted = true;
     }
-    V = convertValue(DL, IRB, V, TargetTy);
+    V = IRB.CreateBitPreservingCastChain(DL, V, TargetTy);
 
     if (IsSplit) {
       assert(!LI.isVolatile());
@@ -3391,7 +3298,7 @@ private:
                           ? ElementTy
                           : FixedVectorType::get(ElementTy, NumElements);
       if (V->getType() != SliceTy)
-        V = convertValue(DL, IRB, V, SliceTy);
+        V = IRB.CreateBitPreservingCastChain(DL, V, SliceTy);
 
       // Mix in the existing elements.
       Value *Old =
@@ -3420,12 +3327,12 @@ private:
         IntTy->getBitWidth()) {
       Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(),
                                          "oldload");
-      Old = convertValue(DL, IRB, Old, IntTy);
+      Old = IRB.CreateBitPreservingCastChain(DL, Old, IntTy);
       assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
       uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
       V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
     }
-    V = convertValue(DL, IRB, V, NewAllocaTy);
+    V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy);
     StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
     Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
                              LLVMContext::MD_access_group});
@@ -3477,7 +3384,7 @@ private:
     if (NewBeginOffset == NewAllocaBeginOffset &&
         NewEndOffset == NewAllocaEndOffset &&
         canConvertValue(DL, V->getType(), NewAllocaTy)) {
-      V = convertValue(DL, IRB, V, NewAllocaTy);
+      V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy);
       Value *NewPtr =
           getPtrToNewAI(SI.getPointerAddressSpace(), SI.isVolatile());
 
@@ -3628,7 +3535,7 @@ private:
 
       Value *Splat = getIntegerSplat(
           II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedValue() / 8);
-      Splat = convertValue(DL, IRB, Splat, ElementTy);
+      Splat = IRB.CreateBitPreservingCastChain(DL, Splat, ElementTy);
       if (NumElements > 1)
         Splat = getVectorSplat(Splat, NumElements);
 
@@ -3647,14 +3554,14 @@ private:
                     EndOffset != NewAllocaBeginOffset)) {
         Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI,
                                            NewAI.getAlign(), "oldload");
-        Old = convertValue(DL, IRB, Old, IntTy);
+        Old = IRB.CreateBitPreservingCastChain(DL, Old, IntTy);
         uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
         V = insertInteger(DL, IRB, Old, V, Offset, "insert");
       } else {
         assert(V->getType() == IntTy &&
                "Wrong type for an alloca wide integer!");
       }
-      V = convertValue(DL, IRB, V, NewAllocaTy);
+      V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy);
     } else {
       // Established these invariants above.
       assert(NewBeginOffset == NewAllocaBeginOffset);
@@ -3666,7 +3573,7 @@ private:
         V = getVectorSplat(
             V, cast<FixedVectorType>(AllocaVecTy)->getNumElements());
 
-      V = convertValue(DL, IRB, V, NewAllocaTy);
+      V = IRB.CreateBitPreservingCastChain(DL, V, NewAllocaTy);
     }
 
     Value *NewPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
@@ -3868,7 +3775,7 @@ private:
     } else if (IntTy && !IsWholeAlloca && !IsDest) {
       Src =
           IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(), "load");
-      Src = convertValue(DL, IRB, Src, IntTy);
+      Src = IRB.CreateBitPreservingCastChain(DL, Src, IntTy);
       uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
       Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
     } else {
@@ -3889,10 +3796,10 @@ private:
     } else if (IntTy && !IsWholeAlloca && IsDest) {
       Value *Old = IRB.CreateAlignedLoad(NewAllocaTy, &NewAI, NewAI.getAlign(),
                                          "oldload");
-      Old = convertValue(DL, IRB, Old, IntTy);
+      Old = IRB.CreateBitPreservingCastChain(DL, Old, IntTy);
       uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
       Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
-      Src = convertValue(DL, IRB, Src, NewAllocaTy);
+      Src = IRB.CreateBitPreservingCastChain(DL, Src, NewAllocaTy);
     }
 
     StoreInst *Store = cast<StoreInst>(
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
index 015ce256a80c..598a954da056 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
@@ -9,9 +9,9 @@ define amdgpu_kernel void @test_overwrite(i64 %val, i1 %cond) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <3 x i64> [[STACK]], i64 43, i32 0
 ; CHECK-NEXT:    br i1 [[COND]], label [[LOOP:%.*]], label [[END:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP3:%.*]], [[LOOP]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <3 x i64> [[PROMOTEALLOCA1]], i64 68, i32 0
+; CHECK-NEXT:    [[PROMOTEALLOCA2:%.*]] = phi <3 x i64> [ [[TMP3:%.*]], [[LOOP]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA2]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <3 x i64> [[PROMOTEALLOCA2]], i64 68, i32 0
 ; CHECK-NEXT:    [[TMP3]] = insertelement <3 x i64> [[TMP2]], i64 32, i32 0
 ; CHECK-NEXT:    [[LOOP_CC:%.*]] = icmp ne i64 [[TMP1]], 68
 ; CHECK-NEXT:    br i1 [[LOOP_CC]], label [[LOOP]], label [[END]]
@@ -67,9 +67,9 @@ define amdgpu_kernel void @test_no_overwrite(i64 %val, i1 %cond) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <3 x i64> [[STACK]], i64 43, i32 0
 ; CHECK-NEXT:    br i1 [[COND]], label [[LOOP:%.*]], label [[END:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[PROMOTEALLOCA1:%.*]] = phi <3 x i64> [ [[TMP2:%.*]], [[LOOP]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA1]], i32 0
-; CHECK-NEXT:    [[TMP2]] = insertelement <3 x i64> [[PROMOTEALLOCA1]], i64 32, i32 1
+; CHECK-NEXT:    [[PROMOTEALLOCA2:%.*]] = phi <3 x i64> [ [[TMP2:%.*]], [[LOOP]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <3 x i64> [[PROMOTEALLOCA2]], i32 0
+; CHECK-NEXT:    [[TMP2]] = insertelement <3 x i64> [[PROMOTEALLOCA2]], i64 32, i32 1
 ; CHECK-NEXT:    [[LOOP_CC:%.*]] = icmp ne i64 [[TMP1]], 32
 ; CHECK-NEXT:    br i1 [[LOOP_CC]], label [[LOOP]], label [[END]]
 ; CHECK:       end:
@@ -192,6 +192,57 @@ entry:
   ret void
 }
 
+define void @alloca_load_store_ptr_ptrvec(ptr %arg) {
+; CHECK-LABEL: define void @alloca_load_store_ptr_ptrvec
+; CHECK-SAME: (ptr [[ARG:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <2 x ptr addrspace(3)> poison
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint ptr [[ARG]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr <2 x i32> [[TMP1]] to <2 x ptr addrspace(3)>
+; CHECK-NEXT:    ret void
+;
+entry:
+  %alloca = alloca <2 x ptr addrspace(3)>, align 8, addrspace(5)
+  store ptr %arg, ptr addrspace(5) %alloca, align 8
+  %tmp = load ptr, ptr addrspace(5) %alloca, align 8
+  ret void
+}
+
+define <2 x ptr> @alloca_load_store_diff_size_ptrvecs1(<2 x ptr> %arg) {
+; CHECK-LABEL: define <2 x ptr> @alloca_load_store_diff_size_ptrvecs1
+; CHECK-SAME: (<2 x ptr> [[ARG:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <4 x ptr addrspace(3)> poison
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint <2 x ptr> [[ARG]] to <2 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> [[TMP0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr <4 x i32> [[TMP1]] to <4 x ptr addrspace(3)>
+; CHECK-NEXT:    ret <2 x ptr> [[ARG]]
+;
+entry:
+  %alloca = alloca <4 x ptr addrspace(3)>, align 8, addrspace(5)
+  store <2 x ptr> %arg, ptr addrspace(5) %alloca, align 8
+  %tmp = load <2 x ptr>, ptr addrspace(5) %alloca, align 8
+  ret <2 x ptr> %tmp
+}
+
+define <4 x ptr addrspace(3)> @alloca_load_store_diff_size_ptrvecs2(<4 x ptr addrspace(3)> %arg) {
+; CHECK-LABEL: define <4 x ptr addrspace(3)> @alloca_load_store_diff_size_ptrvecs2
+; CHECK-SAME: (<4 x ptr addrspace(3)> [[ARG:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ALLOCA:%.*]] = freeze <2 x ptr> poison
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint <4 x ptr addrspace(3)> [[ARG]] to <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <2 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = inttoptr <2 x i64> [[TMP1]] to <2 x ptr>
+; CHECK-NEXT:    ret <4 x ptr addrspace(3)> [[ARG]]
+;
+entry:
+  %alloca = alloca <2 x ptr>, align 8, addrspace(5)
+  store <4 x ptr addrspace(3)> %arg, ptr addrspace(5) %alloca, align 8
+  %tmp = load <4 x ptr addrspace(3)>, ptr addrspace(5) %alloca, align 8
+  ret <4 x ptr addrspace(3)> %tmp
+}
+
 ; Will not vectorize because we're accessing a 64 bit vector with a 32 bits pointer.
 define ptr addrspace(3) @alloca_load_store_ptr_mixed_full_ivec(ptr addrspace(3) %arg) {
 ; CHECK-LABEL: define ptr addrspace(3) @alloca_load_store_ptr_mixed_full_ivec