AMDGPU: Convert promote alloca tests to opaque pointers

2022-11-28 10:36:38 -05:00 · 2022-11-28 10:36:38 -05:00 · 50caf6936b
commit 50caf6936b
parent b3df889b71
22 changed files with 1044 additions and 1181 deletions
--- a/llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
@@ -6,45 +6,43 @@
 ; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] poison, align 4
 ; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] poison, align 4

-define amdgpu_kernel void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define amdgpu_kernel void @promote_alloca_size_63(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 {
 entry:
  %stack = alloca [5 x i32], align 4, addrspace(5)
-  %0 = load i32, i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
-  store i32 4, i32 addrspace(5)* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
-  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
-  store i32 5, i32 addrspace(5)* %arrayidx3, align 4
-  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
-  %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
-  store i32 %2, i32 addrspace(1)* %out, align 4
-  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
-  %3 = load i32, i32 addrspace(5)* %arrayidx12
-  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
-  store i32 %3, i32 addrspace(1)* %arrayidx13
+  %0 = load i32, ptr addrspace(1) %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
+  store i32 4, ptr addrspace(5) %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+  %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
+  store i32 5, ptr addrspace(5) %arrayidx3, align 4
+  %2 = load i32, ptr addrspace(5) %stack, align 4
+  store i32 %2, ptr addrspace(1) %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+  %3 = load i32, ptr addrspace(5) %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+  store i32 %3, ptr addrspace(1) %arrayidx13
  ret void
 }

 ; ALL: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] poison, align 4

-define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
+define amdgpu_kernel void @promote_alloca_size_256(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #1 {
 entry:
  %stack = alloca [5 x i32], align 4, addrspace(5)
-  %0 = load i32, i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
-  store i32 4, i32 addrspace(5)* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
-  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
-  store i32 5, i32 addrspace(5)* %arrayidx3, align 4
-  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
-  %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
-  store i32 %2, i32 addrspace(1)* %out, align 4
-  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
-  %3 = load i32, i32 addrspace(5)* %arrayidx12
-  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
-  store i32 %3, i32 addrspace(1)* %arrayidx13
+  %0 = load i32, ptr addrspace(1) %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
+  store i32 4, ptr addrspace(5) %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+  %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
+  store i32 5, ptr addrspace(5) %arrayidx3, align 4
+  %2 = load i32, ptr addrspace(5) %stack, align 4
+  store i32 %2, ptr addrspace(1) %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+  %3 = load i32, ptr addrspace(5) %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+  store i32 %3, ptr addrspace(1) %arrayidx13
  ret void
 }

@@ -52,69 +50,66 @@ entry:
 ; CI: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1024 x [5 x i32]] poison, align 4
 ; GFX10PLUS: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1024 x [5 x i32]] poison, align 4

-define amdgpu_kernel void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
+define amdgpu_kernel void @promote_alloca_size_1600(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #2 {
 entry:
  %stack = alloca [5 x i32], align 4, addrspace(5)
-  %0 = load i32, i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
-  store i32 4, i32 addrspace(5)* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
-  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
-  store i32 5, i32 addrspace(5)* %arrayidx3, align 4
-  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
-  %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
-  store i32 %2, i32 addrspace(1)* %out, align 4
-  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
-  %3 = load i32, i32 addrspace(5)* %arrayidx12
-  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
-  store i32 %3, i32 addrspace(1)* %arrayidx13
+  %0 = load i32, ptr addrspace(1) %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
+  store i32 4, ptr addrspace(5) %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+  %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
+  store i32 5, ptr addrspace(5) %arrayidx3, align 4
+  %2 = load i32, ptr addrspace(5) %stack, align 4
+  store i32 %2, ptr addrspace(1) %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+  %3 = load i32, ptr addrspace(5) %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+  store i32 %3, ptr addrspace(1) %arrayidx13
  ret void
 }

 ; ALL-LABEL: @occupancy_0(
 ; CI-NOT: alloca [5 x i32]
 ; SI: alloca [5 x i32]
-define amdgpu_kernel void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
+define amdgpu_kernel void @occupancy_0(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #3 {
 entry:
  %stack = alloca [5 x i32], align 4, addrspace(5)
-  %0 = load i32, i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
-  store i32 4, i32 addrspace(5)* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
-  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
-  store i32 5, i32 addrspace(5)* %arrayidx3, align 4
-  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
-  %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
-  store i32 %2, i32 addrspace(1)* %out, align 4
-  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
-  %3 = load i32, i32 addrspace(5)* %arrayidx12
-  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
-  store i32 %3, i32 addrspace(1)* %arrayidx13
+  %0 = load i32, ptr addrspace(1) %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
+  store i32 4, ptr addrspace(5) %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+  %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
+  store i32 5, ptr addrspace(5) %arrayidx3, align 4
+  %2 = load i32, ptr addrspace(5) %stack, align 4
+  store i32 %2, ptr addrspace(1) %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+  %3 = load i32, ptr addrspace(5) %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+  store i32 %3, ptr addrspace(1) %arrayidx13
  ret void
 }

 ; ALL-LABEL: @occupancy_max(
 ; CI-NOT: alloca [5 x i32]
 ; SI: alloca [5 x i32]
-define amdgpu_kernel void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
+define amdgpu_kernel void @occupancy_max(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #4 {
 entry:
  %stack = alloca [5 x i32], align 4, addrspace(5)
-  %0 = load i32, i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
-  store i32 4, i32 addrspace(5)* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
-  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
-  store i32 5, i32 addrspace(5)* %arrayidx3, align 4
-  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
-  %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
-  store i32 %2, i32 addrspace(1)* %out, align 4
-  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
-  %3 = load i32, i32 addrspace(5)* %arrayidx12
-  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
-  store i32 %3, i32 addrspace(1)* %arrayidx13
+  %0 = load i32, ptr addrspace(1) %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
+  store i32 4, ptr addrspace(5) %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+  %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
+  store i32 5, ptr addrspace(5) %arrayidx3, align 4
+  %2 = load i32, ptr addrspace(5) %stack, align 4
+  store i32 %2, ptr addrspace(1) %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+  %3 = load i32, ptr addrspace(5) %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+  store i32 %3, ptr addrspace(1) %arrayidx13
  ret void
 }

@@ -122,25 +117,24 @@ entry:
 ; CI-LABEL: @occupancy_6(
 ; SI: alloca
 ; CI-NOT: alloca
-define amdgpu_kernel void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
+define amdgpu_kernel void @occupancy_6(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #5 {
 entry:
  %stack = alloca [42 x i8], align 4, addrspace(5)
-  %tmp = load i8, i8 addrspace(1)* %in, align 1
+  %tmp = load i8, ptr addrspace(1) %in, align 1
  %tmp4 = sext i8 %tmp to i64
-  %arrayidx1 = getelementptr inbounds [42 x i8], [42 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4
-  store i8 4, i8 addrspace(5)* %arrayidx1, align 1
-  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
-  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
+  %arrayidx1 = getelementptr inbounds [42 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
+  store i8 4, ptr addrspace(5) %arrayidx1, align 1
+  %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
+  %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
-  %arrayidx3 = getelementptr inbounds [42 x i8], [42 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5
-  store i8 5, i8 addrspace(5)* %arrayidx3, align 1
-  %arrayidx10 = getelementptr inbounds [42 x i8], [42 x i8] addrspace(5)* %stack, i64 0, i64 0
-  %tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1
-  store i8 %tmp2, i8 addrspace(1)* %out, align 1
-  %arrayidx12 = getelementptr inbounds [42 x i8], [42 x i8] addrspace(5)* %stack, i64 0, i64 1
-  %tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1
-  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
-  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
+  %arrayidx3 = getelementptr inbounds [42 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
+  store i8 5, ptr addrspace(5) %arrayidx3, align 1
+  %tmp2 = load i8, ptr addrspace(5) %stack, align 1
+  store i8 %tmp2, ptr addrspace(1) %out, align 1
+  %arrayidx12 = getelementptr inbounds [42 x i8], ptr addrspace(5) %stack, i64 0, i64 1
+  %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
+  %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
+  store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
  ret void
 }

@@ -148,25 +142,24 @@ entry:
 ; SICI: alloca [43 x i8]
 ; GFX10PLUS-NOT: alloca

-define amdgpu_kernel void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
+define amdgpu_kernel void @occupancy_6_over(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #5 {
 entry:
  %stack = alloca [43 x i8], align 4, addrspace(5)
-  %tmp = load i8, i8 addrspace(1)* %in, align 1
+  %tmp = load i8, ptr addrspace(1) %in, align 1
  %tmp4 = sext i8 %tmp to i64
-  %arrayidx1 = getelementptr inbounds [43 x i8], [43 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4
-  store i8 4, i8 addrspace(5)* %arrayidx1, align 1
-  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
-  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
+  %arrayidx1 = getelementptr inbounds [43 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
+  store i8 4, ptr addrspace(5) %arrayidx1, align 1
+  %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
+  %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
-  %arrayidx3 = getelementptr inbounds [43 x i8], [43 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5
-  store i8 5, i8 addrspace(5)* %arrayidx3, align 1
-  %arrayidx10 = getelementptr inbounds [43 x i8], [43 x i8] addrspace(5)* %stack, i64 0, i64 0
-  %tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1
-  store i8 %tmp2, i8 addrspace(1)* %out, align 1
-  %arrayidx12 = getelementptr inbounds [43 x i8], [43 x i8] addrspace(5)* %stack, i64 0, i64 1
-  %tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1
-  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
-  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
+  %arrayidx3 = getelementptr inbounds [43 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
+  store i8 5, ptr addrspace(5) %arrayidx3, align 1
+  %tmp2 = load i8, ptr addrspace(5) %stack, align 1
+  store i8 %tmp2, ptr addrspace(1) %out, align 1
+  %arrayidx12 = getelementptr inbounds [43 x i8], ptr addrspace(5) %stack, i64 0, i64 1
+  %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
+  %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
+  store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
  ret void
 }

@@ -174,25 +167,24 @@ entry:
 ; CI-LABEL: @occupancy_8(
 ; SI: alloca
 ; CI-NOT: alloca
-define amdgpu_kernel void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
+define amdgpu_kernel void @occupancy_8(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #6 {
 entry:
  %stack = alloca [32 x i8], align 4, addrspace(5)
-  %tmp = load i8, i8 addrspace(1)* %in, align 1
+  %tmp = load i8, ptr addrspace(1) %in, align 1
  %tmp4 = sext i8 %tmp to i64
-  %arrayidx1 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4
-  store i8 4, i8 addrspace(5)* %arrayidx1, align 1
-  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
-  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
+  %arrayidx1 = getelementptr inbounds [32 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
+  store i8 4, ptr addrspace(5) %arrayidx1, align 1
+  %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
+  %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
-  %arrayidx3 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5
-  store i8 5, i8 addrspace(5)* %arrayidx3, align 1
-  %arrayidx10 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %stack, i64 0, i64 0
-  %tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1
-  store i8 %tmp2, i8 addrspace(1)* %out, align 1
-  %arrayidx12 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %stack, i64 0, i64 1
-  %tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1
-  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
-  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
+  %arrayidx3 = getelementptr inbounds [32 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
+  store i8 5, ptr addrspace(5) %arrayidx3, align 1
+  %tmp2 = load i8, ptr addrspace(5) %stack, align 1
+  store i8 %tmp2, ptr addrspace(1) %out, align 1
+  %arrayidx12 = getelementptr inbounds [32 x i8], ptr addrspace(5) %stack, i64 0, i64 1
+  %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
+  %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
+  store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
  ret void
 }

@@ -200,25 +192,24 @@ entry:
 ; SICI: alloca [33 x i8]
 ; GFX10PLUS-NOT: alloca

-define amdgpu_kernel void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
+define amdgpu_kernel void @occupancy_8_over(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #6 {
 entry:
  %stack = alloca [33 x i8], align 4, addrspace(5)
-  %tmp = load i8, i8 addrspace(1)* %in, align 1
+  %tmp = load i8, ptr addrspace(1) %in, align 1
  %tmp4 = sext i8 %tmp to i64
-  %arrayidx1 = getelementptr inbounds [33 x i8], [33 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4
-  store i8 4, i8 addrspace(5)* %arrayidx1, align 1
-  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
-  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
+  %arrayidx1 = getelementptr inbounds [33 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
+  store i8 4, ptr addrspace(5) %arrayidx1, align 1
+  %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
+  %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
-  %arrayidx3 = getelementptr inbounds [33 x i8], [33 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5
-  store i8 5, i8 addrspace(5)* %arrayidx3, align 1
-  %arrayidx10 = getelementptr inbounds [33 x i8], [33 x i8] addrspace(5)* %stack, i64 0, i64 0
-  %tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1
-  store i8 %tmp2, i8 addrspace(1)* %out, align 1
-  %arrayidx12 = getelementptr inbounds [33 x i8], [33 x i8] addrspace(5)* %stack, i64 0, i64 1
-  %tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1
-  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
-  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
+  %arrayidx3 = getelementptr inbounds [33 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
+  store i8 5, ptr addrspace(5) %arrayidx3, align 1
+  %tmp2 = load i8, ptr addrspace(5) %stack, align 1
+  store i8 %tmp2, ptr addrspace(1) %out, align 1
+  %arrayidx12 = getelementptr inbounds [33 x i8], ptr addrspace(5) %stack, i64 0, i64 1
+  %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
+  %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
+  store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
  ret void
 }

@@ -226,25 +217,24 @@ entry:
 ; CI-LABEL: @occupancy_9(
 ; SI: alloca
 ; CI-NOT: alloca
-define amdgpu_kernel void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
+define amdgpu_kernel void @occupancy_9(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #7 {
 entry:
  %stack = alloca [28 x i8], align 4, addrspace(5)
-  %tmp = load i8, i8 addrspace(1)* %in, align 1
+  %tmp = load i8, ptr addrspace(1) %in, align 1
  %tmp4 = sext i8 %tmp to i64
-  %arrayidx1 = getelementptr inbounds [28 x i8], [28 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4
-  store i8 4, i8 addrspace(5)* %arrayidx1, align 1
-  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
-  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
+  %arrayidx1 = getelementptr inbounds [28 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
+  store i8 4, ptr addrspace(5) %arrayidx1, align 1
+  %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
+  %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
-  %arrayidx3 = getelementptr inbounds [28 x i8], [28 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5
-  store i8 5, i8 addrspace(5)* %arrayidx3, align 1
-  %arrayidx10 = getelementptr inbounds [28 x i8], [28 x i8] addrspace(5)* %stack, i64 0, i64 0
-  %tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1
-  store i8 %tmp2, i8 addrspace(1)* %out, align 1
-  %arrayidx12 = getelementptr inbounds [28 x i8], [28 x i8] addrspace(5)* %stack, i64 0, i64 1
-  %tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1
-  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
-  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
+  %arrayidx3 = getelementptr inbounds [28 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
+  store i8 5, ptr addrspace(5) %arrayidx3, align 1
+  %tmp2 = load i8, ptr addrspace(5) %stack, align 1
+  store i8 %tmp2, ptr addrspace(1) %out, align 1
+  %arrayidx12 = getelementptr inbounds [28 x i8], ptr addrspace(5) %stack, i64 0, i64 1
+  %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
+  %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
+  store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
  ret void
 }

@@ -252,25 +242,24 @@ entry:
 ; SICI: alloca [29 x i8]
 ; GFX10PLUS-NOT: alloca

-define amdgpu_kernel void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
+define amdgpu_kernel void @occupancy_9_over(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #7 {
 entry:
  %stack = alloca [29 x i8], align 4, addrspace(5)
-  %tmp = load i8, i8 addrspace(1)* %in, align 1
+  %tmp = load i8, ptr addrspace(1) %in, align 1
  %tmp4 = sext i8 %tmp to i64
-  %arrayidx1 = getelementptr inbounds [29 x i8], [29 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4
-  store i8 4, i8 addrspace(5)* %arrayidx1, align 1
-  %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
-  %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
+  %arrayidx1 = getelementptr inbounds [29 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
+  store i8 4, ptr addrspace(5) %arrayidx1, align 1
+  %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
+  %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
  %tmp5 = sext i8 %tmp1 to i64
-  %arrayidx3 = getelementptr inbounds [29 x i8], [29 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5
-  store i8 5, i8 addrspace(5)* %arrayidx3, align 1
-  %arrayidx10 = getelementptr inbounds [29 x i8], [29 x i8] addrspace(5)* %stack, i64 0, i64 0
-  %tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1
-  store i8 %tmp2, i8 addrspace(1)* %out, align 1
-  %arrayidx12 = getelementptr inbounds [29 x i8], [29 x i8] addrspace(5)* %stack, i64 0, i64 1
-  %tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1
-  %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
-  store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
+  %arrayidx3 = getelementptr inbounds [29 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
+  store i8 5, ptr addrspace(5) %arrayidx3, align 1
+  %tmp2 = load i8, ptr addrspace(5) %stack, align 1
+  store i8 %tmp2, ptr addrspace(1) %out, align 1
+  %arrayidx12 = getelementptr inbounds [29 x i8], ptr addrspace(5) %stack, i64 0, i64 1
+  %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
+  %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
+  store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
  ret void
 }

--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
@@ -18,44 +18,40 @@ define amdgpu_vs void @promote_1d_aggr() #0 {
 ; CHECK-LABEL: @promote_1d_aggr(
 ; CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
 ; CHECK-NEXT:    [[F1:%.*]] = alloca [1 x float], align 4, addrspace(5)
-; CHECK-NEXT:    [[FOO:%.*]] = getelementptr [[BLOCK:%.*]], [[BLOCK]] addrspace(1)* @block, i32 0, i32 1
-; CHECK-NEXT:    [[FOO1:%.*]] = load i32, i32 addrspace(1)* [[FOO]], align 4
-; CHECK-NEXT:    store i32 [[FOO1]], i32 addrspace(5)* [[I]], align 4
-; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr [[BLOCK]], [[BLOCK]] addrspace(1)* @block, i32 0, i32 0
-; CHECK-NEXT:    [[FOO3:%.*]] = load [1 x float], [1 x float] addrspace(1)* [[FOO2]], align 4
-; CHECK-NEXT:    store [1 x float] [[FOO3]], [1 x float] addrspace(5)* [[F1]], align 4
-; CHECK-NEXT:    [[FOO4:%.*]] = load i32, i32 addrspace(5)* [[I]], align 4
-; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [1 x float], [1 x float] addrspace(5)* [[F1]], i32 0, i32 [[FOO4]]
-; CHECK-NEXT:    [[FOO6:%.*]] = load float, float addrspace(5)* [[FOO5]], align 4
+; CHECK-NEXT:    [[FOO:%.*]] = getelementptr [[BLOCK:%.*]], ptr addrspace(1) @block, i32 0, i32 1
+; CHECK-NEXT:    [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4
+; CHECK-NEXT:    store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4
+; CHECK-NEXT:    [[FOO3:%.*]] = load [1 x float], ptr addrspace(1) @block, align 4
+; CHECK-NEXT:    store [1 x float] [[FOO3]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(5) [[I]], align 4
+; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
+; CHECK-NEXT:    [[FOO6:%.*]] = load float, ptr addrspace(5) [[FOO5]], align 4
 ; CHECK-NEXT:    [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5)
-; CHECK-NEXT:    [[FOO8:%.*]] = load <4 x float>, <4 x float> addrspace(5)* [[FOO7]], align 16
+; CHECK-NEXT:    [[FOO8:%.*]] = load <4 x float>, ptr addrspace(5) [[FOO7]], align 16
 ; CHECK-NEXT:    [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[FOO6]], i32 0
 ; CHECK-NEXT:    [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[FOO6]], i32 1
 ; CHECK-NEXT:    [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[FOO6]], i32 2
 ; CHECK-NEXT:    [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[FOO6]], i32 3
-; CHECK-NEXT:    [[FOO13:%.*]] = getelementptr [[GL_PERVERTEX:%.*]], [[GL_PERVERTEX]] addrspace(1)* @pv, i32 0, i32 0
-; CHECK-NEXT:    store <4 x float> [[FOO12]], <4 x float> addrspace(1)* [[FOO13]], align 16
+; CHECK-NEXT:    store <4 x float> [[FOO12]], ptr addrspace(1) @pv, align 16
 ; CHECK-NEXT:    ret void
 ;
  %i = alloca i32, addrspace(5)
  %f1 = alloca [1 x float], addrspace(5)
-  %foo = getelementptr %Block, %Block addrspace(1)* @block, i32 0, i32 1
-  %foo1 = load i32, i32 addrspace(1)* %foo
-  store i32 %foo1, i32 addrspace(5)* %i
-  %foo2 = getelementptr %Block, %Block addrspace(1)* @block, i32 0, i32 0
-  %foo3 = load [1 x float], [1 x float] addrspace(1)* %foo2
-  store [1 x float] %foo3, [1 x float] addrspace(5)* %f1
-  %foo4 = load i32, i32 addrspace(5)* %i
-  %foo5 = getelementptr [1 x float], [1 x float] addrspace(5)* %f1, i32 0, i32 %foo4
-  %foo6 = load float, float addrspace(5)* %foo5
+  %foo = getelementptr %Block, ptr addrspace(1) @block, i32 0, i32 1
+  %foo1 = load i32, ptr addrspace(1) %foo
+  store i32 %foo1, ptr addrspace(5) %i
+  %foo3 = load [1 x float], ptr addrspace(1) @block
+  store [1 x float] %foo3, ptr addrspace(5) %f1
+  %foo4 = load i32, ptr addrspace(5) %i
+  %foo5 = getelementptr [1 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
+  %foo6 = load float, ptr addrspace(5) %foo5
  %foo7 = alloca <4 x float>, addrspace(5)
-  %foo8 = load <4 x float>, <4 x float> addrspace(5)* %foo7
+  %foo8 = load <4 x float>, ptr addrspace(5) %foo7
  %foo9 = insertelement <4 x float> %foo8, float %foo6, i32 0
  %foo10 = insertelement <4 x float> %foo9, float %foo6, i32 1
  %foo11 = insertelement <4 x float> %foo10, float %foo6, i32 2
  %foo12 = insertelement <4 x float> %foo11, float %foo6, i32 3
-  %foo13 = getelementptr %gl_PerVertex, %gl_PerVertex addrspace(1)* @pv, i32 0, i32 0
-  store <4 x float> %foo12, <4 x float> addrspace(1)* %foo13
+  store <4 x float> %foo12, ptr addrspace(1) @pv
  ret void
 }

@@ -66,44 +62,36 @@ define amdgpu_vs void @promote_store_aggr() #0 {
 ; CHECK-LABEL: @promote_store_aggr(
 ; CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
 ; CHECK-NEXT:    [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5)
-; CHECK-NEXT:    [[FOO:%.*]] = getelementptr [[BLOCK2:%.*]], [[BLOCK2]] addrspace(1)* @block2, i32 0, i32 0
-; CHECK-NEXT:    [[FOO1:%.*]] = load i32, i32 addrspace(1)* [[FOO]], align 4
-; CHECK-NEXT:    store i32 [[FOO1]], i32 addrspace(5)* [[I]], align 4
-; CHECK-NEXT:    [[FOO2:%.*]] = load i32, i32 addrspace(5)* [[I]], align 4
+; CHECK-NEXT:    [[FOO1:%.*]] = load i32, ptr addrspace(1) @block2, align 4
+; CHECK-NEXT:    store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4
+; CHECK-NEXT:    [[FOO2:%.*]] = load i32, ptr addrspace(5) [[I]], align 4
 ; CHECK-NEXT:    [[FOO3:%.*]] = sitofp i32 [[FOO2]] to float
-; CHECK-NEXT:    [[FOO4:%.*]] = getelementptr [2 x float], [2 x float] addrspace(5)* [[F1]], i32 0, i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast [2 x float] addrspace(5)* [[F1]] to <2 x float> addrspace(5)*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float> addrspace(5)* [[TMP1]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[FOO3]], i32 0
-; CHECK-NEXT:    store <2 x float> [[TMP3]], <2 x float> addrspace(5)* [[TMP1]], align 8
-; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [2 x float], [2 x float] addrspace(5)* [[F1]], i32 0, i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast [2 x float] addrspace(5)* [[F1]] to <2 x float> addrspace(5)*
-; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x float>, <2 x float> addrspace(5)* [[TMP4]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float 2.000000e+00, i64 1
-; CHECK-NEXT:    store <2 x float> [[TMP6]], <2 x float> addrspace(5)* [[TMP4]], align 8
-; CHECK-NEXT:    [[FOO6:%.*]] = load [2 x float], [2 x float] addrspace(5)* [[F1]], align 4
-; CHECK-NEXT:    [[FOO7:%.*]] = getelementptr [[BLOCK2]], [[BLOCK2]] addrspace(1)* @block2, i32 0, i32 1
-; CHECK-NEXT:    store [2 x float] [[FOO6]], [2 x float] addrspace(1)* [[FOO7]], align 4
-; CHECK-NEXT:    [[FOO8:%.*]] = getelementptr [[GL_PERVERTEX:%.*]], [[GL_PERVERTEX]] addrspace(1)* @pv, i32 0, i32 0
-; CHECK-NEXT:    store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float> addrspace(1)* [[FOO8]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3]], i32 0
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 8
+; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [2 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float 2.000000e+00, i64 1
+; CHECK-NEXT:    store <2 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 8
+; CHECK-NEXT:    [[FOO6:%.*]] = load [2 x float], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[FOO7:%.*]] = getelementptr [[BLOCK2:%.*]], ptr addrspace(1) @block2, i32 0, i32 1
+; CHECK-NEXT:    store [2 x float] [[FOO6]], ptr addrspace(1) [[FOO7]], align 4
+; CHECK-NEXT:    store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, ptr addrspace(1) @pv, align 16
 ; CHECK-NEXT:    ret void
 ;
  %i = alloca i32, addrspace(5)
  %f1 = alloca [2 x float], addrspace(5)
-  %foo = getelementptr %Block2, %Block2 addrspace(1)* @block2, i32 0, i32 0
-  %foo1 = load i32, i32 addrspace(1)* %foo
-  store i32 %foo1, i32 addrspace(5)* %i
-  %foo2 = load i32, i32 addrspace(5)* %i
+  %foo1 = load i32, ptr addrspace(1) @block2
+  store i32 %foo1, ptr addrspace(5) %i
+  %foo2 = load i32, ptr addrspace(5) %i
  %foo3 = sitofp i32 %foo2 to float
-  %foo4 = getelementptr [2 x float], [2 x float] addrspace(5)* %f1, i32 0, i32 0
-  store float %foo3, float addrspace(5)* %foo4
-  %foo5 = getelementptr [2 x float], [2 x float] addrspace(5)* %f1, i32 0, i32 1
-  store float 2.000000e+00, float addrspace(5)* %foo5
-  %foo6 = load [2 x float], [2 x float] addrspace(5)* %f1
-  %foo7 = getelementptr %Block2, %Block2 addrspace(1)* @block2, i32 0, i32 1
-  store [2 x float] %foo6, [2 x float] addrspace(1)* %foo7
-  %foo8 = getelementptr %gl_PerVertex, %gl_PerVertex addrspace(1)* @pv, i32 0, i32 0
-  store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float> addrspace(1)* %foo8
+  store float %foo3, ptr addrspace(5) %f1
+  %foo5 = getelementptr [2 x float], ptr addrspace(5) %f1, i32 0, i32 1
+  store float 2.000000e+00, ptr addrspace(5) %foo5
+  %foo6 = load [2 x float], ptr addrspace(5) %f1
+  %foo7 = getelementptr %Block2, ptr addrspace(1) @block2, i32 0, i32 1
+  store [2 x float] %foo6, ptr addrspace(1) %foo7
+  store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, ptr addrspace(1) @pv
  ret void
 }

@@ -114,46 +102,41 @@ define amdgpu_vs void @promote_load_from_store_aggr() #0 {
 ; CHECK-LABEL: @promote_load_from_store_aggr(
 ; CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
 ; CHECK-NEXT:    [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5)
-; CHECK-NEXT:    [[FOO:%.*]] = getelementptr [[BLOCK3:%.*]], [[BLOCK3]] addrspace(1)* @block3, i32 0, i32 1
-; CHECK-NEXT:    [[FOO1:%.*]] = load i32, i32 addrspace(1)* [[FOO]], align 4
-; CHECK-NEXT:    store i32 [[FOO1]], i32 addrspace(5)* [[I]], align 4
-; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr [[BLOCK3]], [[BLOCK3]] addrspace(1)* @block3, i32 0, i32 0
-; CHECK-NEXT:    [[FOO3:%.*]] = load [2 x float], [2 x float] addrspace(1)* [[FOO2]], align 4
-; CHECK-NEXT:    store [2 x float] [[FOO3]], [2 x float] addrspace(5)* [[F1]], align 4
-; CHECK-NEXT:    [[FOO4:%.*]] = load i32, i32 addrspace(5)* [[I]], align 4
-; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [2 x float], [2 x float] addrspace(5)* [[F1]], i32 0, i32 [[FOO4]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast [2 x float] addrspace(5)* [[F1]] to <2 x float> addrspace(5)*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float> addrspace(5)* [[TMP1]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 [[FOO4]]
+; CHECK-NEXT:    [[FOO:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 1
+; CHECK-NEXT:    [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4
+; CHECK-NEXT:    store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4
+; CHECK-NEXT:    [[FOO3:%.*]] = load [2 x float], ptr addrspace(1) @block3, align 4
+; CHECK-NEXT:    store [2 x float] [[FOO3]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT:    [[FOO4:%.*]] = load i32, ptr addrspace(5) [[I]], align 4
+; CHECK-NEXT:    [[FOO5:%.*]] = getelementptr [2 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 [[FOO4]]
 ; CHECK-NEXT:    [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5)
-; CHECK-NEXT:    [[FOO8:%.*]] = load <4 x float>, <4 x float> addrspace(5)* [[FOO7]], align 16
-; CHECK-NEXT:    [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[TMP3]], i32 0
-; CHECK-NEXT:    [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP3]], i32 1
-; CHECK-NEXT:    [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP3]], i32 2
-; CHECK-NEXT:    [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP3]], i32 3
-; CHECK-NEXT:    [[FOO13:%.*]] = getelementptr [[GL_PERVERTEX:%.*]], [[GL_PERVERTEX]] addrspace(1)* @pv, i32 0, i32 0
-; CHECK-NEXT:    store <4 x float> [[FOO12]], <4 x float> addrspace(1)* [[FOO13]], align 16
+; CHECK-NEXT:    [[FOO8:%.*]] = load <4 x float>, ptr addrspace(5) [[FOO7]], align 16
+; CHECK-NEXT:    [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[TMP2]], i32 0
+; CHECK-NEXT:    [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP2]], i32 1
+; CHECK-NEXT:    [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP2]], i32 2
+; CHECK-NEXT:    [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP2]], i32 3
+; CHECK-NEXT:    store <4 x float> [[FOO12]], ptr addrspace(1) @pv, align 16
 ; CHECK-NEXT:    ret void
 ;
  %i = alloca i32, addrspace(5)
  %f1 = alloca [2 x float], addrspace(5)
-  %foo = getelementptr %Block3, %Block3 addrspace(1)* @block3, i32 0, i32 1
-  %foo1 = load i32, i32 addrspace(1)* %foo
-  store i32 %foo1, i32 addrspace(5)* %i
-  %foo2 = getelementptr %Block3, %Block3 addrspace(1)* @block3, i32 0, i32 0
-  %foo3 = load [2 x float], [2 x float] addrspace(1)* %foo2
-  store [2 x float] %foo3, [2 x float] addrspace(5)* %f1
-  %foo4 = load i32, i32 addrspace(5)* %i
-  %foo5 = getelementptr [2 x float], [2 x float] addrspace(5)* %f1, i32 0, i32 %foo4
-  %foo6 = load float, float addrspace(5)* %foo5
+  %foo = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 1
+  %foo1 = load i32, ptr addrspace(1) %foo
+  store i32 %foo1, ptr addrspace(5) %i
+  %foo3 = load [2 x float], ptr addrspace(1) @block3
+  store [2 x float] %foo3, ptr addrspace(5) %f1
+  %foo4 = load i32, ptr addrspace(5) %i
+  %foo5 = getelementptr [2 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
+  %foo6 = load float, ptr addrspace(5) %foo5
  %foo7 = alloca <4 x float>, addrspace(5)
-  %foo8 = load <4 x float>, <4 x float> addrspace(5)* %foo7
+  %foo8 = load <4 x float>, ptr addrspace(5) %foo7
  %foo9 = insertelement <4 x float> %foo8, float %foo6, i32 0
  %foo10 = insertelement <4 x float> %foo9, float %foo6, i32 1
  %foo11 = insertelement <4 x float> %foo10, float %foo6, i32 2
  %foo12 = insertelement <4 x float> %foo11, float %foo6, i32 3
-  %foo13 = getelementptr %gl_PerVertex, %gl_PerVertex addrspace(1)* @pv, i32 0, i32 0
-  store <4 x float> %foo12, <4 x float> addrspace(1)* %foo13
+  store <4 x float> %foo12, ptr addrspace(1) @pv
  ret void
 }

@@ -163,70 +146,61 @@ define amdgpu_vs void @promote_load_from_store_aggr() #0 {
 define amdgpu_ps void @promote_double_aggr() #0 {
 ; CHECK-LABEL: @promote_double_aggr(
 ; CHECK-NEXT:    [[S:%.*]] = alloca [2 x double], align 8, addrspace(5)
-; CHECK-NEXT:    [[FOO:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(1)* @tmp_g, i32 0, i32 0, i32 0
-; CHECK-NEXT:    [[FOO1:%.*]] = load double, double addrspace(1)* [[FOO]], align 8
-; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(1)* @tmp_g, i32 0, i32 0, i32 1
-; CHECK-NEXT:    [[FOO3:%.*]] = load double, double addrspace(1)* [[FOO2]], align 8
+; CHECK-NEXT:    [[FOO:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 0
+; CHECK-NEXT:    [[FOO1:%.*]] = load double, ptr addrspace(1) [[FOO]], align 8
+; CHECK-NEXT:    [[FOO2:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 1
+; CHECK-NEXT:    [[FOO3:%.*]] = load double, ptr addrspace(1) [[FOO2]], align 8
 ; CHECK-NEXT:    [[FOO4:%.*]] = insertvalue [2 x double] undef, double [[FOO1]], 0
 ; CHECK-NEXT:    [[FOO5:%.*]] = insertvalue [2 x double] [[FOO4]], double [[FOO3]], 1
-; CHECK-NEXT:    store [2 x double] [[FOO5]], [2 x double] addrspace(5)* [[S]], align 8
-; CHECK-NEXT:    [[FOO6:%.*]] = getelementptr [2 x double], [2 x double] addrspace(5)* [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast [2 x double] addrspace(5)* [[S]] to <2 x double> addrspace(5)*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double> addrspace(5)* [[TMP1]], align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i64 1
-; CHECK-NEXT:    [[FOO8:%.*]] = getelementptr [2 x double], [2 x double] addrspace(5)* [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast [2 x double] addrspace(5)* [[S]] to <2 x double> addrspace(5)*
-; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, <2 x double> addrspace(5)* [[TMP4]], align 16
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 1
-; CHECK-NEXT:    [[FOO10:%.*]] = fadd double [[TMP3]], [[TMP6]]
-; CHECK-NEXT:    [[FOO11:%.*]] = getelementptr [2 x double], [2 x double] addrspace(5)* [[S]], i32 0, i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast [2 x double] addrspace(5)* [[S]] to <2 x double> addrspace(5)*
-; CHECK-NEXT:    [[TMP8:%.*]] = load <2 x double>, <2 x double> addrspace(5)* [[TMP7]], align 16
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[FOO10]], i32 0
-; CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double> addrspace(5)* [[TMP7]], align 16
-; CHECK-NEXT:    [[FOO12:%.*]] = getelementptr [2 x double], [2 x double] addrspace(5)* [[S]], i32 0, i32 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast [2 x double] addrspace(5)* [[S]] to <2 x double> addrspace(5)*
-; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x double>, <2 x double> addrspace(5)* [[TMP10]], align 16
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x double> [[TMP11]], i32 0
-; CHECK-NEXT:    [[FOO14:%.*]] = getelementptr [2 x double], [2 x double] addrspace(5)* [[S]], i32 0, i32 1
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast [2 x double] addrspace(5)* [[S]] to <2 x double> addrspace(5)*
-; CHECK-NEXT:    [[TMP14:%.*]] = load <2 x double>, <2 x double> addrspace(5)* [[TMP13]], align 16
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x double> [[TMP14]], i64 1
-; CHECK-NEXT:    [[FOO16:%.*]] = fadd double [[TMP12]], [[TMP15]]
+; CHECK-NEXT:    store [2 x double] [[FOO5]], ptr addrspace(5) [[S]], align 8
+; CHECK-NEXT:    [[FOO6:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i64 1
+; CHECK-NEXT:    [[FOO8:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 16
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i64 1
+; CHECK-NEXT:    [[FOO10:%.*]] = fadd double [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 16
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[FOO10]], i32 0
+; CHECK-NEXT:    store <2 x double> [[TMP6]], ptr addrspace(5) [[S]], align 16
+; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 16
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
+; CHECK-NEXT:    [[FOO14:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 16
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i64 1
+; CHECK-NEXT:    [[FOO16:%.*]] = fadd double [[TMP8]], [[TMP10]]
 ; CHECK-NEXT:    [[FOO17:%.*]] = fptrunc double [[FOO16]] to float
 ; CHECK-NEXT:    [[FOO18:%.*]] = insertelement <4 x float> undef, float [[FOO17]], i32 0
 ; CHECK-NEXT:    [[FOO19:%.*]] = insertelement <4 x float> [[FOO18]], float [[FOO17]], i32 1
 ; CHECK-NEXT:    [[FOO20:%.*]] = insertelement <4 x float> [[FOO19]], float [[FOO17]], i32 2
 ; CHECK-NEXT:    [[FOO21:%.*]] = insertelement <4 x float> [[FOO20]], float [[FOO17]], i32 3
-; CHECK-NEXT:    store <4 x float> [[FOO21]], <4 x float> addrspace(1)* @frag_color, align 16
+; CHECK-NEXT:    store <4 x float> [[FOO21]], ptr addrspace(1) @frag_color, align 16
 ; CHECK-NEXT:    ret void
 ;
  %s = alloca [2 x double], addrspace(5)
-  %foo = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(1)* @tmp_g, i32 0, i32 0, i32 0
-  %foo1 = load double, double addrspace(1)* %foo
-  %foo2 = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(1)* @tmp_g, i32 0, i32 0, i32 1
-  %foo3 = load double, double addrspace(1)* %foo2
+  %foo = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 0
+  %foo1 = load double, ptr addrspace(1) %foo
+  %foo2 = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 1
+  %foo3 = load double, ptr addrspace(1) %foo2
  %foo4 = insertvalue [2 x double] undef, double %foo1, 0
  %foo5 = insertvalue [2 x double] %foo4, double %foo3, 1
-  store [2 x double] %foo5, [2 x double] addrspace(5)* %s
-  %foo6 = getelementptr [2 x double], [2 x double] addrspace(5)* %s, i32 0, i32 1
-  %foo7 = load double, double addrspace(5)* %foo6
-  %foo8 = getelementptr [2 x double], [2 x double] addrspace(5)* %s, i32 0, i32 1
-  %foo9 = load double, double addrspace(5)* %foo8
+  store [2 x double] %foo5, ptr addrspace(5) %s
+  %foo6 = getelementptr [2 x double], ptr addrspace(5) %s, i32 0, i32 1
+  %foo7 = load double, ptr addrspace(5) %foo6
+  %foo8 = getelementptr [2 x double], ptr addrspace(5) %s, i32 0, i32 1
+  %foo9 = load double, ptr addrspace(5) %foo8
  %foo10 = fadd double %foo7, %foo9
-  %foo11 = getelementptr [2 x double], [2 x double] addrspace(5)* %s, i32 0, i32 0
-  store double %foo10, double addrspace(5)* %foo11
-  %foo12 = getelementptr [2 x double], [2 x double] addrspace(5)* %s, i32 0, i32 0
-  %foo13 = load double, double addrspace(5)* %foo12
-  %foo14 = getelementptr [2 x double], [2 x double] addrspace(5)* %s, i32 0, i32 1
-  %foo15 = load double, double addrspace(5)* %foo14
+  store double %foo10, ptr addrspace(5) %s
+  %foo13 = load double, ptr addrspace(5) %s
+  %foo14 = getelementptr [2 x double], ptr addrspace(5) %s, i32 0, i32 1
+  %foo15 = load double, ptr addrspace(5) %foo14
  %foo16 = fadd double %foo13, %foo15
  %foo17 = fptrunc double %foo16 to float
  %foo18 = insertelement <4 x float> undef, float %foo17, i32 0
  %foo19 = insertelement <4 x float> %foo18, float %foo17, i32 1
  %foo20 = insertelement <4 x float> %foo19, float %foo17, i32 2
  %foo21 = insertelement <4 x float> %foo20, float %foo17, i32 3
-  store <4 x float> %foo21, <4 x float> addrspace(1)* @frag_color
+  store <4 x float> %foo21, ptr addrspace(1) @frag_color
  ret void
 }

@@ -234,22 +208,21 @@ define amdgpu_ps void @promote_double_aggr() #0 {
 define amdgpu_kernel void @alloca_struct() #0 {
 ; CHECK-LABEL: @alloca_struct(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[TMP0]] to i32 addrspace(4)*
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(4)* [[TMP1]], i64 1
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[TMP2]], align 4, !invariant.load !0
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(4)* [[TMP1]], i64 2
-; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32 addrspace(4)* [[TMP4]], align 4, !range [[RNG1:![0-9]+]], !invariant.load !0
-; CHECK-NEXT:    [[TMP6:%.*]] = lshr i32 [[TMP3]], 16
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG2:![0-9]+]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.workitem.id.y(), !range [[RNG2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.amdgcn.workitem.id.z(), !range [[RNG2]]
-; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]]
-; CHECK-NEXT:    [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]]
-; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [1024 x [2 x %struct]], [1024 x [2 x %struct]] addrspace(3)* @alloca_struct.alloca, i32 0, i32 [[TMP14]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load !0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1:![0-9]+]], !invariant.load !0
+; CHECK-NEXT:    [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG2:![0-9]+]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.workitem.id.y(), !range [[RNG2]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.workitem.id.z(), !range [[RNG2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds [1024 x [2 x %struct]], ptr addrspace(3) @alloca_struct.alloca, i32 0, i32 [[TMP13]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll
@@ -5,45 +5,43 @@

 ; CHECK-LABEL: @array_alloca(
 ; CHECK: %stack = alloca i32, i32 5, align 4, addrspace(5)
-define amdgpu_kernel void @array_alloca(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define amdgpu_kernel void @array_alloca(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 {
 entry:
  %stack = alloca i32, i32 5, align 4, addrspace(5)
-  %ld0 = load i32, i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 %ld0
-  store i32 4, i32 addrspace(5)* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
-  %ld1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 %ld1
-  store i32 5, i32 addrspace(5)* %arrayidx3, align 4
-  %arrayidx10 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 0
-  %ld2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
-  store i32 %ld2, i32 addrspace(1)* %out, align 4
-  %arrayidx12 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 1
-  %ld3 = load i32, i32 addrspace(5)* %arrayidx12
-  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
-  store i32 %ld3, i32 addrspace(1)* %arrayidx13
+  %ld0 = load i32, ptr addrspace(1) %in, align 4
+  %arrayidx1 = getelementptr inbounds i32, ptr addrspace(5) %stack, i32 %ld0
+  store i32 4, ptr addrspace(5) %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+  %ld1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds i32, ptr addrspace(5) %stack, i32 %ld1
+  store i32 5, ptr addrspace(5) %arrayidx3, align 4
+  %ld2 = load i32, ptr addrspace(5) %stack, align 4
+  store i32 %ld2, ptr addrspace(1) %out, align 4
+  %arrayidx12 = getelementptr inbounds i32, ptr addrspace(5) %stack, i32 1
+  %ld3 = load i32, ptr addrspace(5) %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+  store i32 %ld3, ptr addrspace(1) %arrayidx13
  ret void
 }

 ; CHECK-LABEL: @array_alloca_dynamic(
 ; CHECK: %stack = alloca i32, i32 %size, align 4, addrspace(5)
-define amdgpu_kernel void @array_alloca_dynamic(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %size) #0 {
+define amdgpu_kernel void @array_alloca_dynamic(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in, i32 %size) #0 {
 entry:
  %stack = alloca i32, i32 %size, align 4, addrspace(5)
-  %ld0 = load i32, i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 %ld0
-  store i32 4, i32 addrspace(5)* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
-  %ld1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 %ld1
-  store i32 5, i32 addrspace(5)* %arrayidx3, align 4
-  %arrayidx10 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 0
-  %ld2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
-  store i32 %ld2, i32 addrspace(1)* %out, align 4
-  %arrayidx12 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 1
-  %ld3 = load i32, i32 addrspace(5)* %arrayidx12
-  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
-  store i32 %ld3, i32 addrspace(1)* %arrayidx13
+  %ld0 = load i32, ptr addrspace(1) %in, align 4
+  %arrayidx1 = getelementptr inbounds i32, ptr addrspace(5) %stack, i32 %ld0
+  store i32 4, ptr addrspace(5) %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+  %ld1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds i32, ptr addrspace(5) %stack, i32 %ld1
+  store i32 5, ptr addrspace(5) %arrayidx3, align 4
+  %ld2 = load i32, ptr addrspace(5) %stack, align 4
+  store i32 %ld2, ptr addrspace(1) %out, align 4
+  %arrayidx12 = getelementptr inbounds i32, ptr addrspace(5) %stack, i32 1
+  %ld3 = load i32, ptr addrspace(5) %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+  store i32 %ld3, ptr addrspace(1) %arrayidx13
  ret void
 }

--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll
@@ -1,28 +1,27 @@
 ; RUN: opt -data-layout=A5 -S -mtriple=amdgcn-unknown-unknown -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=IR %s
 ; RUN: llc -march=amdgcn -mcpu=fiji -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=ASM %s

-; IR-LABEL: define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 {
+; IR-LABEL: define amdgpu_vs void @promote_alloca_shaders(ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %in) #0 {
 ; IR: alloca [5 x i32]

 ; ASM-LABEL: {{^}}promote_alloca_shaders:
 ; ASM: ; ScratchSize: 24
-define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 {
+define amdgpu_vs void @promote_alloca_shaders(ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %in) #0 {
 entry:
  %stack = alloca [5 x i32], align 4, addrspace(5)
-  %tmp0 = load i32, i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp0
-  store i32 4, i32 addrspace(5)* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
-  %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp1
-  store i32 5, i32 addrspace(5)* %arrayidx3, align 4
-  %arrayidx4 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
-  %tmp2 = load i32, i32 addrspace(5)* %arrayidx4, align 4
-  store i32 %tmp2, i32 addrspace(1)* %out, align 4
-  %arrayidx5 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
-  %tmp3 = load i32, i32 addrspace(5)* %arrayidx5
-  %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
-  store i32 %tmp3, i32 addrspace(1)* %arrayidx6
+  %tmp0 = load i32, ptr addrspace(1) %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp0
+  store i32 4, ptr addrspace(5) %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+  %tmp1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp1
+  store i32 5, ptr addrspace(5) %arrayidx3, align 4
+  %tmp2 = load i32, ptr addrspace(5) %stack, align 4
+  store i32 %tmp2, ptr addrspace(1) %out, align 4
+  %arrayidx5 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+  %tmp3 = load i32, ptr addrspace(5) %arrayidx5
+  %arrayidx6 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+  store i32 %tmp3, ptr addrspace(1) %arrayidx6
  ret void
 }

@@ -33,18 +32,17 @@ entry:
 ; ASM-LABEL: {{^}}promote_to_vector_call_c:
 ; ASM-NOT: LDSByteSize
 ; ASM: ; ScratchSize: 12
-define void @promote_to_vector_call_c(i32 addrspace(1)* %out, i32 %in) #0 {
+define void @promote_to_vector_call_c(ptr addrspace(1) %out, i32 %in) #0 {
 entry:
  %tmp = alloca [2 x i32], addrspace(5)
-  %tmp1 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 0
-  %tmp2 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 1
-  store i32 0, i32 addrspace(5)* %tmp1
-  store i32 1, i32 addrspace(5)* %tmp2
-  %tmp3 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 %in
-  %tmp4 = load i32, i32 addrspace(5)* %tmp3
-  %tmp5 = load volatile i32, i32 addrspace(1)* undef
+  %tmp2 = getelementptr [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+  store i32 0, ptr addrspace(5) %tmp
+  store i32 1, ptr addrspace(5) %tmp2
+  %tmp3 = getelementptr [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 %in
+  %tmp4 = load i32, ptr addrspace(5) %tmp3
+  %tmp5 = load volatile i32, ptr addrspace(1) undef
  %tmp6 = add i32 %tmp4, %tmp5
-  store i32 %tmp6, i32 addrspace(1)* %out
+  store i32 %tmp6, ptr addrspace(1) %out
  ret void
 }

@ -54,43 +52,41 @@ entry:
 ; ASM-LABEL: {{^}}no_promote_to_lds_c:
 ; ASM-NOT: LDSByteSize
 ; ASM: ; ScratchSize: 24
-define void @no_promote_to_lds_c(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define void @no_promote_to_lds_c(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 {
 entry:
  %stack = alloca [5 x i32], align 4, addrspace(5)
-  %0 = load i32, i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
-  store i32 4, i32 addrspace(5)* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
-  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
-  store i32 5, i32 addrspace(5)* %arrayidx3, align 4
-  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
-  %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
-  store i32 %2, i32 addrspace(1)* %out, align 4
-  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
-  %3 = load i32, i32 addrspace(5)* %arrayidx12
-  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
-  store i32 %3, i32 addrspace(1)* %arrayidx13
+  %0 = load i32, ptr addrspace(1) %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
+  store i32 4, ptr addrspace(5) %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+  %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
+  store i32 5, ptr addrspace(5) %arrayidx3, align 4
+  %2 = load i32, ptr addrspace(5) %stack, align 4
+  store i32 %2, ptr addrspace(1) %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+  %3 = load i32, ptr addrspace(5) %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+  store i32 %3, ptr addrspace(1) %arrayidx13
  ret void
 }

-declare i32 @foo(i32 addrspace(5)*) #0
+declare i32 @foo(ptr addrspace(5)) #0

 ; ASM-LABEL: {{^}}call_private:
 ; ASM: buffer_store_dword
 ; ASM: buffer_store_dword
 ; ASM: s_swappc_b64
 ; ASM: ScratchSize: 16400
-define amdgpu_kernel void @call_private(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @call_private(ptr addrspace(1) %out, i32 %in) #0 {
 entry:
  %tmp = alloca [2 x i32], addrspace(5)
-  %tmp1 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 0
-  %tmp2 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 1
-  store i32 0, i32 addrspace(5)* %tmp1
-  store i32 1, i32 addrspace(5)* %tmp2
-  %tmp3 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 %in
-  %val = call i32 @foo(i32 addrspace(5)* %tmp3)
-  store i32 %val, i32 addrspace(1)* %out
+  %tmp2 = getelementptr [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+  store i32 0, ptr addrspace(5) %tmp
+  store i32 1, ptr addrspace(5) %tmp2
+  %tmp3 = getelementptr [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 %in
+  %val = call i32 @foo(ptr addrspace(5) %tmp3)
+  store i32 %val, ptr addrspace(1) %out
  ret void
 }

--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
@ -1,23 +1,22 @@
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mtriple=amdgcn-- -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 target datalayout = "A5"

-declare {}* @llvm.invariant.start.p5i8(i64, i8 addrspace(5)* nocapture) #0
-declare void @llvm.invariant.end.p5i8({}*, i64, i8 addrspace(5)* nocapture) #0
-declare i8 addrspace(5)* @llvm.launder.invariant.group.p5i8(i8 addrspace(5)*) #1
+declare ptr @llvm.invariant.start.p5(i64, ptr addrspace(5) nocapture) #0
+declare void @llvm.invariant.end.p5(ptr, i64, ptr addrspace(5) nocapture) #0
+declare ptr addrspace(5) @llvm.launder.invariant.group.p5(ptr addrspace(5)) #1

 ; GCN-LABEL: {{^}}use_invariant_promotable_lds:
 ; GCN: buffer_load_dword
 ; GCN: ds_write_b32
-define amdgpu_kernel void @use_invariant_promotable_lds(i32 addrspace(1)* %arg) #2 {
+define amdgpu_kernel void @use_invariant_promotable_lds(ptr addrspace(1) %arg) #2 {
 bb:
  %tmp = alloca i32, align 4, addrspace(5)
-  %tmp1 = bitcast i32 addrspace(5)* %tmp to i8 addrspace(5)*
-  %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
-  %tmp3 = load i32, i32 addrspace(1)* %tmp2
-  store i32 %tmp3, i32 addrspace(5)* %tmp
-  %tmp4 = call {}* @llvm.invariant.start.p5i8(i64 4, i8 addrspace(5)* %tmp1) #0
-  call void @llvm.invariant.end.p5i8({}* %tmp4, i64 4, i8 addrspace(5)* %tmp1) #0
-  %tmp5 = call i8 addrspace(5)* @llvm.launder.invariant.group.p5i8(i8 addrspace(5)* %tmp1) #1
+  %tmp2 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
+  %tmp3 = load i32, ptr addrspace(1) %tmp2
+  store i32 %tmp3, ptr addrspace(5) %tmp
+  %tmp4 = call ptr @llvm.invariant.start.p5(i64 4, ptr addrspace(5) %tmp) #0
+  call void @llvm.invariant.end.p5(ptr %tmp4, i64 4, ptr addrspace(5) %tmp) #0
+  %tmp5 = call ptr addrspace(5) @llvm.launder.invariant.group.p5(ptr addrspace(5) %tmp) #1
  ret void
 }

--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll
@ -2,22 +2,21 @@

 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"

-declare void @llvm.lifetime.start.p5i8(i64, i8 addrspace(5)* nocapture) #0
-declare void @llvm.lifetime.end.p5i8(i64, i8 addrspace(5)* nocapture) #0
+declare void @llvm.lifetime.start.p5(i64, ptr addrspace(5) nocapture) #0
+declare void @llvm.lifetime.end.p5(i64, ptr addrspace(5) nocapture) #0

 ; OPT-LABEL: @use_lifetime_promotable_lds(
 ; OPT-NOT: alloca i32
 ; OPT-NOT: llvm.lifetime
-; OPT: store i32 %tmp3, i32 addrspace(3)*
-define amdgpu_kernel void @use_lifetime_promotable_lds(i32 addrspace(1)* %arg) #2 {
+; OPT: store i32 %tmp3, ptr addrspace(3)
+define amdgpu_kernel void @use_lifetime_promotable_lds(ptr addrspace(1) %arg) #2 {
 bb:
  %tmp = alloca i32, align 4, addrspace(5)
-  %tmp1 = bitcast i32 addrspace(5)* %tmp to i8 addrspace(5)*
-  call void @llvm.lifetime.start.p5i8(i64 4, i8 addrspace(5)* %tmp1)
-  %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
-  %tmp3 = load i32, i32 addrspace(1)* %tmp2
-  store i32 %tmp3, i32 addrspace(5)* %tmp
-  call void @llvm.lifetime.end.p5i8(i64 4, i8 addrspace(5)* %tmp1)
+  call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %tmp)
+  %tmp2 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
+  %tmp3 = load i32, ptr addrspace(1) %tmp2
+  store i32 %tmp3, ptr addrspace(5) %tmp
+  call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %tmp)
  ret void
 }

@ -29,7 +28,7 @@ bb:
 define amdgpu_kernel void @iterator_erased_lifetime() {
 entry:
  %alloca = alloca i8, align 1, addrspace(5)
-  call void @llvm.lifetime.start.p5i8(i64 1, i8 addrspace(5)* %alloca)
+  call void @llvm.lifetime.start.p5(i64 1, ptr addrspace(5) %alloca)
  ret void
 }

--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
@ -1,95 +1,77 @@
-; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -passes=amdgpu-promote-alloca < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -passes=amdgpu-promote-alloca < %s | FileCheck --enable-var-scope %s

-declare void @llvm.memcpy.p5i8.p1i8.i32(i8 addrspace(5)* nocapture, i8 addrspace(1)* nocapture, i32, i1) #0
-declare void @llvm.memcpy.p1i8.p5i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(5)* nocapture, i32, i1) #0
-declare void @llvm.memcpy.p5i8.p5i8.i64(i8 addrspace(5)* nocapture, i8 addrspace(5)* nocapture, i64, i1) #0
+declare void @llvm.memcpy.p5.p1.i32(ptr addrspace(5) nocapture, ptr addrspace(1) nocapture, i32, i1) #0
+declare void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) nocapture, ptr addrspace(5) nocapture, i32, i1) #0
+declare void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) nocapture, ptr addrspace(5) nocapture, i64, i1) #0

-declare void @llvm.memmove.p5i8.p1i8.i32(i8 addrspace(5)* nocapture, i8 addrspace(1)* nocapture, i32, i1) #0
-declare void @llvm.memmove.p1i8.p5i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(5)* nocapture, i32, i1) #0
-declare void @llvm.memmove.p5i8.p5i8.i64(i8 addrspace(5)* nocapture, i8 addrspace(5)* nocapture, i64, i1) #0
+declare void @llvm.memmove.p5.p1.i32(ptr addrspace(5) nocapture, ptr addrspace(1) nocapture, i32, i1) #0
+declare void @llvm.memmove.p1.p5.i32(ptr addrspace(1) nocapture, ptr addrspace(5) nocapture, i32, i1) #0
+declare void @llvm.memmove.p5.p5.i64(ptr addrspace(5) nocapture, ptr addrspace(5) nocapture, i64, i1) #0

-declare void @llvm.memset.p5i8.i32(i8 addrspace(5)* nocapture, i8, i32, i1) #0
+declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture, i8, i32, i1) #0

-declare i32 @llvm.objectsize.i32.p5i8(i8 addrspace(5)*, i1, i1, i1) #1
+declare i32 @llvm.objectsize.i32.p5(ptr addrspace(5), i1, i1, i1) #1

 ; CHECK-LABEL: @promote_with_memcpy(
-; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memcpy.alloca, i32 0, i32 %{{[0-9]+}}
-; CHECK: call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %alloca.bc, i8 addrspace(1)* align 4 %in.bc, i32 68, i1 false)
-; CHECK: call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out.bc, i8 addrspace(3)* align 4 %alloca.bc, i32 68, i1 false)
-define amdgpu_kernel void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+; CHECK: [[GEP:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], ptr addrspace(3) @promote_with_memcpy.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 4 [[GEP]], ptr addrspace(1) align 4 %in, i32 68, i1 false)
+; CHECK: call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 %out, ptr addrspace(3) align 4 [[GEP]], i32 68, i1 false)
+define amdgpu_kernel void @promote_with_memcpy(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %alloca = alloca [17 x i32], align 4, addrspace(5)
-  %alloca.bc = bitcast [17 x i32] addrspace(5)* %alloca to i8 addrspace(5)*
-  %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
-  %out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)*
-  call void @llvm.memcpy.p5i8.p1i8.i32(i8 addrspace(5)* align 4 %alloca.bc, i8 addrspace(1)* align 4 %in.bc, i32 68, i1 false)
-  call void @llvm.memcpy.p1i8.p5i8.i32(i8 addrspace(1)* align 4 %out.bc, i8 addrspace(5)* align 4 %alloca.bc, i32 68, i1 false)
+  call void @llvm.memcpy.p5.p1.i32(ptr addrspace(5) align 4 %alloca, ptr addrspace(1) align 4 %in, i32 68, i1 false)
+  call void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) align 4 %out, ptr addrspace(5) align 4 %alloca, i32 68, i1 false)
  ret void
 }

 ; CHECK-LABEL: @promote_with_memmove(
-; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memmove.alloca, i32 0, i32 %{{[0-9]+}}
-; CHECK: call void @llvm.memmove.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %alloca.bc, i8 addrspace(1)* align 4 %in.bc, i32 68, i1 false)
-; CHECK: call void @llvm.memmove.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out.bc, i8 addrspace(3)* align 4 %alloca.bc, i32 68, i1 false)
-define amdgpu_kernel void @promote_with_memmove(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+; CHECK: [[GEP:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], ptr addrspace(3) @promote_with_memmove.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: call void @llvm.memmove.p3.p1.i32(ptr addrspace(3) align 4 [[GEP]], ptr addrspace(1) align 4 %in, i32 68, i1 false)
+; CHECK: call void @llvm.memmove.p1.p3.i32(ptr addrspace(1) align 4 %out, ptr addrspace(3) align 4 [[GEP]], i32 68, i1 false)
+define amdgpu_kernel void @promote_with_memmove(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %alloca = alloca [17 x i32], align 4, addrspace(5)
-  %alloca.bc = bitcast [17 x i32] addrspace(5)* %alloca to i8 addrspace(5)*
-  %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
-  %out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)*
-  call void @llvm.memmove.p5i8.p1i8.i32(i8 addrspace(5)* align 4 %alloca.bc, i8 addrspace(1)* align 4 %in.bc, i32 68, i1 false)
-  call void @llvm.memmove.p1i8.p5i8.i32(i8 addrspace(1)* align 4 %out.bc, i8 addrspace(5)* align 4 %alloca.bc, i32 68, i1 false)
+  call void @llvm.memmove.p5.p1.i32(ptr addrspace(5) align 4 %alloca, ptr addrspace(1) align 4 %in, i32 68, i1 false)
+  call void @llvm.memmove.p1.p5.i32(ptr addrspace(1) align 4 %out, ptr addrspace(5) align 4 %alloca, i32 68, i1 false)
  ret void
 }

 ; CHECK-LABEL: @promote_with_memset(
-; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memset.alloca, i32 0, i32 %{{[0-9]+}}
-; CHECK: call void @llvm.memset.p3i8.i32(i8 addrspace(3)* align 4 %alloca.bc, i8 7, i32 68, i1 false)
-define amdgpu_kernel void @promote_with_memset(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+; CHECK: [[GEP:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], ptr addrspace(3) @promote_with_memset.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: call void @llvm.memset.p3.i32(ptr addrspace(3) align 4 [[GEP]], i8 7, i32 68, i1 false)
+define amdgpu_kernel void @promote_with_memset(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
  %alloca = alloca [17 x i32], align 4, addrspace(5)
-  %alloca.bc = bitcast [17 x i32] addrspace(5)* %alloca to i8 addrspace(5)*
-  %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
-  %out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)*
-  call void @llvm.memset.p5i8.i32(i8 addrspace(5)* align 4 %alloca.bc, i8 7, i32 68, i1 false)
+  call void @llvm.memset.p5.i32(ptr addrspace(5) align 4 %alloca, i8 7, i32 68, i1 false)
  ret void
 }

 ; CHECK-LABEL: @promote_with_objectsize(
-; CHECK: [[PTR:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_objectsize.alloca, i32 0, i32 %{{[0-9]+}}
-; CHECK: call i32 @llvm.objectsize.i32.p3i8(i8 addrspace(3)* %alloca.bc, i1 false, i1 false, i1 false)
-define amdgpu_kernel void @promote_with_objectsize(i32 addrspace(1)* %out) #0 {
+; CHECK: [[PTR:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], ptr addrspace(3) @promote_with_objectsize.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: call i32 @llvm.objectsize.i32.p3(ptr addrspace(3) [[PTR]], i1 false, i1 false, i1 false)
+define amdgpu_kernel void @promote_with_objectsize(ptr addrspace(1) %out) #0 {
  %alloca = alloca [17 x i32], align 4, addrspace(5)
-  %alloca.bc = bitcast [17 x i32] addrspace(5)* %alloca to i8 addrspace(5)*
-  %size = call i32 @llvm.objectsize.i32.p5i8(i8 addrspace(5)* %alloca.bc, i1 false, i1 false, i1 false)
-  store i32 %size, i32 addrspace(1)* %out
+  %size = call i32 @llvm.objectsize.i32.p5(ptr addrspace(5) %alloca, i1 false, i1 false, i1 false)
+  store i32 %size, ptr addrspace(1) %out
  ret void
 }

 ; CHECK-LABEL: @promote_alloca_used_twice_in_memcpy(
-; CHECK: %i = bitcast double addrspace(3)* %arrayidx1 to i8 addrspace(3)*
-; CHECK: %i1 = bitcast double addrspace(3)* %arrayidx2 to i8 addrspace(3)*
-; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* align 8 dereferenceable(16) %i, i8 addrspace(3)* align 8 dereferenceable(16) %i1, i64 16, i1 false)
+; CHECK: call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) align 8 dereferenceable(16) %arrayidx1, ptr addrspace(3) align 8 dereferenceable(16) %arrayidx2, i64 16, i1 false)
 define amdgpu_kernel void @promote_alloca_used_twice_in_memcpy(i32 %c) {
 entry:
  %r = alloca double, align 8, addrspace(5)
-  %arrayidx1 = getelementptr inbounds double, double addrspace(5)* %r, i32 1
-  %i = bitcast double addrspace(5)* %arrayidx1 to i8 addrspace(5)*
-  %arrayidx2 = getelementptr inbounds double, double addrspace(5)* %r, i32 %c
-  %i1 = bitcast double addrspace(5)* %arrayidx2 to i8 addrspace(5)*
-  call void @llvm.memcpy.p5i8.p5i8.i64(i8 addrspace(5)* align 8 dereferenceable(16) %i, i8 addrspace(5)* align 8 dereferenceable(16) %i1, i64 16, i1 false)
+  %arrayidx1 = getelementptr inbounds double, ptr addrspace(5) %r, i32 1
+  %arrayidx2 = getelementptr inbounds double, ptr addrspace(5) %r, i32 %c
+  call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 dereferenceable(16) %arrayidx1, ptr addrspace(5) align 8 dereferenceable(16) %arrayidx2, i64 16, i1 false)
  ret void
 }

 ; CHECK-LABEL: @promote_alloca_used_twice_in_memmove(
-; CHECK: %i = bitcast double addrspace(3)* %arrayidx1 to i8 addrspace(3)*
-; CHECK: %i1 = bitcast double addrspace(3)* %arrayidx2 to i8 addrspace(3)*
-; CHECK: call void @llvm.memmove.p3i8.p3i8.i64(i8 addrspace(3)* align 8 dereferenceable(16) %i, i8 addrspace(3)* align 8 dereferenceable(16) %i1, i64 16, i1 false)
+; CHECK: call void @llvm.memmove.p3.p3.i64(ptr addrspace(3) align 8 dereferenceable(16) %arrayidx1, ptr addrspace(3) align 8 dereferenceable(16) %arrayidx2, i64 16, i1 false)
 define amdgpu_kernel void @promote_alloca_used_twice_in_memmove(i32 %c) {
 entry:
  %r = alloca double, align 8, addrspace(5)
-  %arrayidx1 = getelementptr inbounds double, double addrspace(5)* %r, i32 1
-  %i = bitcast double addrspace(5)* %arrayidx1 to i8 addrspace(5)*
-  %arrayidx2 = getelementptr inbounds double, double addrspace(5)* %r, i32 %c
-  %i1 = bitcast double addrspace(5)* %arrayidx2 to i8 addrspace(5)*
-  call void @llvm.memmove.p5i8.p5i8.i64(i8 addrspace(5)* align 8 dereferenceable(16) %i, i8 addrspace(5)* align 8 dereferenceable(16) %i1, i64 16, i1 false)
+  %arrayidx1 = getelementptr inbounds double, ptr addrspace(5) %r, i32 1
+  %arrayidx2 = getelementptr inbounds double, ptr addrspace(5) %r, i32 %c
+  call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) align 8 dereferenceable(16) %arrayidx1, ptr addrspace(5) align 8 dereferenceable(16) %arrayidx2, i64 16, i1 false)
  ret void
 }

--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
@ -5,32 +5,30 @@
 ; NOOPTS: workgroup_group_segment_byte_size = 0{{$}}
 ; NOOPTS-NOT: ds_write
 ; OPTS: ds_write
-define amdgpu_kernel void @promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @promote_alloca_i32_array_array(ptr addrspace(1) %out, i32 %index) #0 {
 entry:
  %alloca = alloca [2 x [2 x i32]], addrspace(5)
-  %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0
-  %gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1
-  store i32 0, i32 addrspace(5)* %gep0
-  store i32 1, i32 addrspace(5)* %gep1
-  %gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index
-  %load = load i32, i32 addrspace(5)* %gep2
-  store i32 %load, i32 addrspace(1)* %out
+  %gep1 = getelementptr inbounds [2 x [2 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
+  store i32 0, ptr addrspace(5) %alloca
+  store i32 1, ptr addrspace(5) %gep1
+  %gep2 = getelementptr inbounds [2 x [2 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 %index
+  %load = load i32, ptr addrspace(5) %gep2
+  store i32 %load, ptr addrspace(1) %out
  ret void
 }

 ; ALL-LABEL: {{^}}optnone_promote_alloca_i32_array_array:
 ; ALL: workgroup_group_segment_byte_size = 0{{$}}
 ; ALL-NOT: ds_write
-define amdgpu_kernel void @optnone_promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #1 {
+define amdgpu_kernel void @optnone_promote_alloca_i32_array_array(ptr addrspace(1) %out, i32 %index) #1 {
 entry:
  %alloca = alloca [2 x [2 x i32]], addrspace(5)
-  %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0
-  %gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1
-  store i32 0, i32 addrspace(5)* %gep0
-  store i32 1, i32 addrspace(5)* %gep1
-  %gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index
-  %load = load i32, i32 addrspace(5)* %gep2
-  store i32 %load, i32 addrspace(1)* %out
+  %gep1 = getelementptr inbounds [2 x [2 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
+  store i32 0, ptr addrspace(5) %alloca
+  store i32 1, ptr addrspace(5) %gep1
+  %gep2 = getelementptr inbounds [2 x [2 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 %index
+  %load = load i32, ptr addrspace(5) %gep2
+  store i32 %load, ptr addrspace(1) %out
  ret void
 }

--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
@ -32,64 +32,62 @@

 ; GCN-LABEL: {{^}}promote_alloca_size_order_0:
 ; GCN: workgroup_group_segment_byte_size = 1060
-define amdgpu_kernel void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+define amdgpu_kernel void @promote_alloca_size_order_0(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in, i32 %idx) #0 {
 entry:
  %stack = alloca [5 x i32], align 4, addrspace(5)
-  %tmp0 = load i32, i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp0
-  store i32 4, i32 addrspace(5)* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
-  %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp1
-  store i32 5, i32 addrspace(5)* %arrayidx3, align 4
-  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
-  %tmp2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
-  store i32 %tmp2, i32 addrspace(1)* %out, align 4
-  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
-  %tmp3 = load i32, i32 addrspace(5)* %arrayidx12
-  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
-  store i32 %tmp3, i32 addrspace(1)* %arrayidx13
+  %tmp0 = load i32, ptr addrspace(1) %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp0
+  store i32 4, ptr addrspace(5) %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+  %tmp1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp1
+  store i32 5, ptr addrspace(5) %arrayidx3, align 4
+  %tmp2 = load i32, ptr addrspace(5) %stack, align 4
+  store i32 %tmp2, ptr addrspace(1) %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+  %tmp3 = load i32, ptr addrspace(5) %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+  store i32 %tmp3, ptr addrspace(1) %arrayidx13

-  %gep.lds1 = getelementptr inbounds [73 x i32], [73 x i32] addrspace(3)* @lds1, i32 0, i32 %idx
-  store volatile i32 0, i32 addrspace(3)* %gep.lds1, align 4
+  %gep.lds1 = getelementptr inbounds [73 x i32], ptr addrspace(3) @lds1, i32 0, i32 %idx
+  store volatile i32 0, ptr addrspace(3) %gep.lds1, align 4

-  %gep.lds2 = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @lds2, i32 0, i32 %idx
-  store volatile i64 0, i64 addrspace(3)* %gep.lds2, align 8
+  %gep.lds2 = getelementptr inbounds [32 x i64], ptr addrspace(3) @lds2, i32 0, i32 %idx
+  store volatile i64 0, ptr addrspace(3) %gep.lds2, align 8

-  %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], [32 x <4 x i32>] addrspace(3)* @lds0, i32 0, i32 %idx
-  store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds0, align 16
+  %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], ptr addrspace(3) @lds0, i32 0, i32 %idx
+  store volatile <4 x i32> zeroinitializer, ptr addrspace(3) %gep.lds0, align 16

  ret void
 }

 ; GCN-LABEL: {{^}}promote_alloca_size_order_1:
 ; GCN: workgroup_group_segment_byte_size = 1072
-define amdgpu_kernel void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+define amdgpu_kernel void @promote_alloca_size_order_1(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in, i32 %idx) #0 {
 entry:
  %stack = alloca [5 x i32], align 4, addrspace(5)
-  %tmp0 = load i32, i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp0
-  store i32 4, i32 addrspace(5)* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
-  %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp1
-  store i32 5, i32 addrspace(5)* %arrayidx3, align 4
-  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
-  %tmp2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
-  store i32 %tmp2, i32 addrspace(1)* %out, align 4
-  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
-  %tmp3 = load i32, i32 addrspace(5)* %arrayidx12
-  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
-  store i32 %tmp3, i32 addrspace(1)* %arrayidx13
+  %tmp0 = load i32, ptr addrspace(1) %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp0
+  store i32 4, ptr addrspace(5) %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+  %tmp1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp1
+  store i32 5, ptr addrspace(5) %arrayidx3, align 4
+  %tmp2 = load i32, ptr addrspace(5) %stack, align 4
+  store i32 %tmp2, ptr addrspace(1) %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+  %tmp3 = load i32, ptr addrspace(5) %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+  store i32 %tmp3, ptr addrspace(1) %arrayidx13

-  %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], [32 x <4 x i32>] addrspace(3)* @lds0, i32 0, i32 %idx
-  store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds0, align 16
+  %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], ptr addrspace(3) @lds0, i32 0, i32 %idx
+  store volatile <4 x i32> zeroinitializer, ptr addrspace(3) %gep.lds0, align 16

-  %gep.lds2 = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @lds2, i32 0, i32 %idx
-  store volatile i64 0, i64 addrspace(3)* %gep.lds2, align 8
+  %gep.lds2 = getelementptr inbounds [32 x i64], ptr addrspace(3) @lds2, i32 0, i32 %idx
+  store volatile i64 0, ptr addrspace(3) %gep.lds2, align 8

-  %gep.lds1 = getelementptr inbounds [73 x i32], [73 x i32] addrspace(3)* @lds1, i32 0, i32 %idx
-  store volatile i32 0, i32 addrspace(3)* %gep.lds1, align 4
+  %gep.lds1 = getelementptr inbounds [73 x i32], ptr addrspace(3) @lds1, i32 0, i32 %idx
+  store volatile i32 0, ptr addrspace(3) %gep.lds1, align 4

  ret void
 }
@ -102,29 +100,28 @@ entry:

 ; GCN-LABEL: {{^}}promote_alloca_align_pad_guess_over_limit:
 ; GCN: workgroup_group_segment_byte_size = 1060
-define amdgpu_kernel void @promote_alloca_align_pad_guess_over_limit(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+define amdgpu_kernel void @promote_alloca_align_pad_guess_over_limit(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in, i32 %idx) #0 {
 entry:
  %stack = alloca [5 x i32], align 4, addrspace(5)
-  %tmp0 = load i32, i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp0
-  store i32 4, i32 addrspace(5)* %arrayidx1, align 4
-  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
-  %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp1
-  store i32 5, i32 addrspace(5)* %arrayidx3, align 4
-  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
-  %tmp2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
-  store i32 %tmp2, i32 addrspace(1)* %out, align 4
-  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
-  %tmp3 = load i32, i32 addrspace(5)* %arrayidx12
-  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
-  store i32 %tmp3, i32 addrspace(1)* %arrayidx13
+  %tmp0 = load i32, ptr addrspace(1) %in, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp0
+  store i32 4, ptr addrspace(5) %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+  %tmp1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp1
+  store i32 5, ptr addrspace(5) %arrayidx3, align 4
+  %tmp2 = load i32, ptr addrspace(5) %stack, align 4
+  store i32 %tmp2, ptr addrspace(1) %out, align 4
+  %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+  %tmp3 = load i32, ptr addrspace(5) %arrayidx12
+  %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+  store i32 %tmp3, ptr addrspace(1) %arrayidx13

-  %gep.lds3 = getelementptr inbounds [13 x i32], [13 x i32] addrspace(3)* @lds3, i32 0, i32 %idx
-  store volatile i32 0, i32 addrspace(3)* %gep.lds3, align 4
+  %gep.lds3 = getelementptr inbounds [13 x i32], ptr addrspace(3) @lds3, i32 0, i32 %idx
+  store volatile i32 0, ptr addrspace(3) %gep.lds3, align 4

-  %gep.lds4 = getelementptr inbounds [63 x <4 x i32>], [63 x <4 x i32>] addrspace(3)* @lds4, i32 0, i32 %idx
-  store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds4, align 16
+  %gep.lds4 = getelementptr inbounds [63 x <4 x i32>], ptr addrspace(3) @lds4, i32 0, i32 %idx
+  store volatile <4 x i32> zeroinitializer, ptr addrspace(3) %gep.lds4, align 16

  ret void
 }
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll
@ -4,25 +4,19 @@
 define i64 @test_pointer_array(i64 %v) {
 ; OPT-LABEL: @test_pointer_array(
 ; OPT-NEXT:  entry:
-; OPT-NEXT:    [[A:%.*]] = alloca [3 x i8*], align 16, addrspace(5)
-; OPT-NEXT:    [[GEP:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*] addrspace(5)* [[A]], i32 0, i32 0
-; OPT-NEXT:    [[CAST:%.*]] = bitcast i8* addrspace(5)* [[GEP]] to i64 addrspace(5)*
-; OPT-NEXT:    [[TMP0:%.*]] = bitcast [3 x i8*] addrspace(5)* [[A]] to <3 x i8*> addrspace(5)*
-; OPT-NEXT:    [[TMP1:%.*]] = load <3 x i8*>, <3 x i8*> addrspace(5)* [[TMP0]], align 32
-; OPT-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[V:%.*]] to i8*
-; OPT-NEXT:    [[TMP3:%.*]] = insertelement <3 x i8*> [[TMP1]], i8* [[TMP2]], i32 0
-; OPT-NEXT:    store <3 x i8*> [[TMP3]], <3 x i8*> addrspace(5)* [[TMP0]], align 32
-; OPT-NEXT:    [[TMP4:%.*]] = bitcast [3 x i8*] addrspace(5)* [[A]] to <3 x i8*> addrspace(5)*
-; OPT-NEXT:    [[TMP5:%.*]] = load <3 x i8*>, <3 x i8*> addrspace(5)* [[TMP4]], align 32
-; OPT-NEXT:    [[TMP6:%.*]] = extractelement <3 x i8*> [[TMP5]], i32 0
-; OPT-NEXT:    [[TMP7:%.*]] = ptrtoint i8* [[TMP6]] to i64
+; OPT-NEXT:    [[A:%.*]] = alloca [3 x ptr], align 16, addrspace(5)
+; OPT-NEXT:    [[TMP1:%.*]] = load <3 x ptr>, ptr addrspace(5) [[A]], align 32
+; OPT-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[V:%.*]] to ptr
+; OPT-NEXT:    [[TMP3:%.*]] = insertelement <3 x ptr> [[TMP1]], ptr [[TMP2]], i32 0
+; OPT-NEXT:    store <3 x ptr> [[TMP3]], ptr addrspace(5) [[A]], align 32
+; OPT-NEXT:    [[TMP5:%.*]] = load <3 x ptr>, ptr addrspace(5) [[A]], align 32
+; OPT-NEXT:    [[TMP6:%.*]] = extractelement <3 x ptr> [[TMP5]], i32 0
+; OPT-NEXT:    [[TMP7:%.*]] = ptrtoint ptr [[TMP6]] to i64
 ; OPT-NEXT:    ret i64 [[TMP7]]
 ;
 entry:
-  %a = alloca [3 x i8*], align 16, addrspace(5)
-  %gep = getelementptr inbounds [3 x i8*], [3 x i8*] addrspace(5)* %a, i32 0, i32 0
-  %cast = bitcast i8* addrspace(5)* %gep to i64 addrspace(5)*
-  store i64 %v, i64 addrspace(5)* %cast, align 16
-  %ld = load i64, i64 addrspace(5)* %cast, align 16
+  %a = alloca [3 x ptr], align 16, addrspace(5)
+  store i64 %v, ptr addrspace(5) %a, align 16
+  %ld = load i64, ptr addrspace(5) %a, align 16
  ret i64 %ld
 }
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
@ -5,22 +5,22 @@

 ; Stores 0.0 into a single-float addrspace(5) alloca and then stores the
 ; alloca's own address through %ptr. The only output requirement is that a
 ; buffer_store_dword follows the label.
 ; GCN-LABEL: {{^}}stored_lds_pointer_value:
 ; GCN: buffer_store_dword v
-define amdgpu_kernel void @stored_lds_pointer_value(float addrspace(5)* addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @stored_lds_pointer_value(ptr addrspace(1) %ptr) #0 {
   %tmp = alloca float, addrspace(5)
-  store float 0.0, float  addrspace(5)*%tmp
-  store float addrspace(5)* %tmp, float addrspace(5)* addrspace(1)* %ptr
+  store float 0.0, ptr  addrspace(5) %tmp
+  store ptr addrspace(5) %tmp, ptr addrspace(1) %ptr
   ret void
 }

 ; Two single-float addrspace(5) allocas are each initialized to 0.0, and the
 ; address of each one is then written through %ptr with a volatile store.
 ; GCN-LABEL: {{^}}stored_lds_pointer_value_offset:
 ; GCN: buffer_store_dword v
-define amdgpu_kernel void @stored_lds_pointer_value_offset(float addrspace(5)* addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @stored_lds_pointer_value_offset(ptr addrspace(1) %ptr) #0 {
   %tmp0 = alloca float, addrspace(5)
   %tmp1 = alloca float, addrspace(5)
-  store float 0.0, float  addrspace(5)*%tmp0
-  store float 0.0, float  addrspace(5)*%tmp1
-  store volatile float addrspace(5)* %tmp0, float addrspace(5)* addrspace(1)* %ptr
-  store volatile float addrspace(5)* %tmp1, float addrspace(5)* addrspace(1)* %ptr
+  store float 0.0, ptr  addrspace(5) %tmp0
+  store float 0.0, ptr  addrspace(5) %tmp1
+  store volatile ptr addrspace(5) %tmp0, ptr addrspace(1) %ptr
+  store volatile ptr addrspace(5) %tmp1, ptr addrspace(1) %ptr
   ret void
 }

@ -29,12 +29,12 @@ define amdgpu_kernel void @stored_lds_pointer_value_offset(float addrspace(5)* a
 ; %tmp is a 16-element float alloca in addrspace(5). Element 0 is set to 0.0,
 ; and the pointer stored through %ptr is a GEP into the alloca at the runtime
 ; index %idx.
 ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
 ; GCN: buffer_store_dword v
 ; GCN: buffer_store_dword v
-define amdgpu_kernel void @stored_lds_pointer_value_gep(float addrspace(5)* addrspace(1)* %ptr, i32 %idx) #0 {
+define amdgpu_kernel void @stored_lds_pointer_value_gep(ptr addrspace(1) %ptr, i32 %idx) #0 {
 bb:
   %tmp = alloca float, i32 16, addrspace(5)
-  store float 0.0, float addrspace(5)* %tmp
-  %tmp2 = getelementptr inbounds float, float addrspace(5)* %tmp, i32 %idx
-  store float addrspace(5)* %tmp2, float addrspace(5)* addrspace(1)* %ptr
+  store float 0.0, ptr addrspace(5) %tmp
+  %tmp2 = getelementptr inbounds float, ptr addrspace(5) %tmp, i32 %idx
+  store ptr addrspace(5) %tmp2, ptr addrspace(1) %ptr
   ret void
 }

@ -46,29 +46,27 @@ bb:
 ; Writes 0, 1, 2, 3 into a [4 x i32] addrspace(5) alloca, then stores the
 ; address of element %index (not its value) through %out. In the updated
 ; form the zero-index GEP %x is dropped and %tmp0 is stored to directly.
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
-define amdgpu_kernel void @stored_vector_pointer_value(i32 addrspace(5)* addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @stored_vector_pointer_value(ptr addrspace(1) %out, i32 %index) {
 entry:
   %tmp0 = alloca [4 x i32], addrspace(5)
-  %x = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp0, i32 0, i32 0
-  %y = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp0, i32 0, i32 1
-  %z = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp0, i32 0, i32 2
-  %w = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp0, i32 0, i32 3
-  store i32 0, i32 addrspace(5)* %x
-  store i32 1, i32 addrspace(5)* %y
-  store i32 2, i32 addrspace(5)* %z
-  store i32 3, i32 addrspace(5)* %w
-  %tmp1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp0, i32 0, i32 %index
-  store i32 addrspace(5)* %tmp1, i32 addrspace(5)* addrspace(1)* %out
+  %y = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp0, i32 0, i32 1
+  %z = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp0, i32 0, i32 2
+  %w = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp0, i32 0, i32 3
+  store i32 0, ptr addrspace(5) %tmp0
+  store i32 1, ptr addrspace(5) %y
+  store i32 2, ptr addrspace(5) %z
+  store i32 3, ptr addrspace(5) %w
+  %tmp1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp0, i32 0, i32 %index
+  store ptr addrspace(5) %tmp1, ptr addrspace(1) %out
   ret void
 }

 ; The alloca holds an addrspace(5) pointer. It first receives the constant
 ; inttoptr(1234) and then its own address, both via volatile stores. The
 ; GCN-NOT line rejects any ds_ match after the label. In the updated form
 ; the old %bitcast is unnecessary and %tmp is stored directly.
 ; GCN-LABEL: {{^}}stored_fi_to_self:
 ; GCN-NOT: ds_
 define amdgpu_kernel void @stored_fi_to_self() #0 {
-  %tmp = alloca i32 addrspace(5)*, addrspace(5)
-  store volatile i32 addrspace(5)* inttoptr (i32 1234 to i32 addrspace(5)*), i32 addrspace(5)* addrspace(5)* %tmp
-  %bitcast = bitcast i32 addrspace(5)* addrspace(5)* %tmp to i32 addrspace(5)*
-  store volatile i32 addrspace(5)* %bitcast, i32 addrspace(5)* addrspace(5)* %tmp
+  %tmp = alloca ptr addrspace(5), addrspace(5)
+  store volatile ptr addrspace(5) inttoptr (i32 1234 to ptr addrspace(5)), ptr addrspace(5) %tmp
+  store volatile ptr addrspace(5) %tmp, ptr addrspace(5) %tmp
   ret void
 }

--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-strip-abi-opt-attributes.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-strip-abi-opt-attributes.ll
@ -3,23 +3,22 @@
 ; This kernel starts with the amdgpu-no-workitem-id-* attributes, but they
 ; need to be removed when these intrinsic uses are introduced.

 ; Writes 0 and 1 into a [2 x i32] addrspace(5) alloca, loads element %in,
 ; adds a volatile load from an undef addrspace(1) pointer, and stores the sum
 ; through %out. The expected output contains calls to
 ; llvm.amdgcn.dispatch.ptr and to the workitem.id.x/y/z intrinsics.
-; CHECK-LABEL: define amdgpu_kernel void @promote_to_lds(i32 addrspace(1)* %out, i32 %in) #0 {
-; CHECK: call noalias nonnull dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+; CHECK-LABEL: define amdgpu_kernel void @promote_to_lds(ptr addrspace(1) %out, i32 %in) #0 {
+; CHECK: call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
 ; CHECK: call i32 @llvm.amdgcn.workitem.id.x(), !range !2
 ; CHECK: call i32 @llvm.amdgcn.workitem.id.y(), !range !2
 ; CHECK: call i32 @llvm.amdgcn.workitem.id.z(), !range !2
-define amdgpu_kernel void @promote_to_lds(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @promote_to_lds(ptr addrspace(1) %out, i32 %in) #0 {
 entry:
   %tmp = alloca [2 x i32], addrspace(5)
-  %tmp1 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 0
-  %tmp2 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 1
-  store i32 0, i32 addrspace(5)* %tmp1
-  store i32 1, i32 addrspace(5)* %tmp2
-  %tmp3 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 %in
-  %tmp4 = load i32, i32 addrspace(5)* %tmp3
-  %tmp5 = load volatile i32, i32 addrspace(1)* undef
+  %tmp2 = getelementptr inbounds [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+  store i32 0, ptr addrspace(5) %tmp
+  store i32 1, ptr addrspace(5) %tmp2
+  %tmp3 = getelementptr inbounds [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 %in
+  %tmp4 = load i32, ptr addrspace(5) %tmp3
+  %tmp5 = load volatile i32, ptr addrspace(1) undef
   %tmp6 = add i32 %tmp4, %tmp5
-  store i32 %tmp6, i32 addrspace(1)* %out
+  store i32 %tmp6, ptr addrspace(1) %out
   ret void
 }

--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
@ -7,8 +7,8 @@ target datalayout = "A5"
@some_lds = internal unnamed_addr addrspace(3) global [32 x i32] undef, align 4
@some_dynamic_lds = external hidden addrspace(3) global [0 x i32], align 4

 ; The two addrspace(1) globals below are initialized with the address of an
 ; LDS (addrspace(3)) global converted to i32 by a ptrtoint constant
 ; expression. @all_lds is defined outside the lines shown in this hunk.
-@initializer_user_some = addrspace(1) global i32 ptrtoint ([32 x i32] addrspace(3)* @some_lds to i32), align 4
-@initializer_user_all = addrspace(1) global i32 ptrtoint ([16384 x i32] addrspace(3)* @all_lds to i32), align 4
+@initializer_user_some = addrspace(1) global i32 ptrtoint (ptr addrspace(3) @some_lds to i32), align 4
+@initializer_user_all = addrspace(1) global i32 ptrtoint (ptr addrspace(3) @all_lds to i32), align 4

 ; This function cannot promote to using LDS because of the size of the
 ; constant expression use in the function, which was previously not
@ -18,22 +18,21 @@ target datalayout = "A5"

 ; Fills a [4 x i32] addrspace(5) alloca with 9, 10, 99, 43, loads element
 ; %idx and stores it through %out. The final volatile store writes
 ; ptrtoint(@all_lds) as a constant expression, which is the LDS use this
 ; test is about. Expected fixed group segment size is 65536.
 ; ASM-LABEL: constant_expression_uses_all_lds:
 ; ASM: .amdhsa_group_segment_fixed_size 65536
-define amdgpu_kernel void @constant_expression_uses_all_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+define amdgpu_kernel void @constant_expression_uses_all_lds(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
 entry:
   %stack = alloca [4 x i32], align 4, addrspace(5)
-  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
-  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
-  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
-  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
-  store i32 9, i32 addrspace(5)* %gep0
-  store i32 10, i32 addrspace(5)* %gep1
-  store i32 99, i32 addrspace(5)* %gep2
-  store i32 43, i32 addrspace(5)* %gep3
-  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
-  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
-  store i32 %load, i32 addrspace(1)* %out
+  %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+  %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
+  %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
+  store i32 9, ptr addrspace(5) %stack
+  store i32 10, ptr addrspace(5) %gep1
+  store i32 99, ptr addrspace(5) %gep2
+  store i32 43, ptr addrspace(5) %gep3
+  %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
+  %load = load i32, ptr addrspace(5) %arrayidx, align 4
+  store i32 %load, ptr addrspace(1) %out

-  store volatile i32 ptrtoint ([16384 x i32] addrspace(3)* @all_lds to i32), i32 addrspace(1)* undef
+  store volatile i32 ptrtoint (ptr addrspace(3) @all_lds to i32), ptr addrspace(1) undef
   ret void
 }

@ -45,21 +44,20 @@ entry:

 ; Same stack-array pattern as the all_lds variant, but the constant
 ; expression in the final volatile store references @some_lds
 ; ([32 x i32]). Expected fixed group segment size is exactly 4224.
 ; ASM-LABEL: {{^}}constant_expression_uses_some_lds:
 ; ASM: .amdhsa_group_segment_fixed_size 4224{{$}}
-define amdgpu_kernel void @constant_expression_uses_some_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+define amdgpu_kernel void @constant_expression_uses_some_lds(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
 entry:
   %stack = alloca [4 x i32], align 4, addrspace(5)
-  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
-  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
-  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
-  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
-  store i32 9, i32 addrspace(5)* %gep0
-  store i32 10, i32 addrspace(5)* %gep1
-  store i32 99, i32 addrspace(5)* %gep2
-  store i32 43, i32 addrspace(5)* %gep3
-  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
-  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
-  store i32 %load, i32 addrspace(1)* %out
-  store volatile i32 ptrtoint ([32 x i32] addrspace(3)* @some_lds to i32), i32 addrspace(1)* undef
+  %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+  %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
+  %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
+  store i32 9, ptr addrspace(5) %stack
+  store i32 10, ptr addrspace(5) %gep1
+  store i32 99, ptr addrspace(5) %gep2
+  store i32 43, ptr addrspace(5) %gep3
+  %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
+  %load = load i32, ptr addrspace(5) %arrayidx, align 4
+  store i32 %load, ptr addrspace(1) %out
+  store volatile i32 ptrtoint (ptr addrspace(3) @some_lds to i32), ptr addrspace(1) undef
   ret void
 }

@ -71,47 +69,44 @@ entry:

 ; Same stack-array pattern, but the LDS use is a store of 1234 through an
 ; addrspacecast of @some_dynamic_lds (an external [0 x i32] global). In the
 ; updated form the zero-index GEP %gep_dyn_lds is gone and the addrspacecast
 ; constant is the store address directly. Expected fixed group segment size
 ; is exactly 0.
 ; ASM-LABEL: {{^}}constant_expression_uses_some_dynamic_lds:
 ; ASM: .amdhsa_group_segment_fixed_size 0{{$}}
-define amdgpu_kernel void @constant_expression_uses_some_dynamic_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+define amdgpu_kernel void @constant_expression_uses_some_dynamic_lds(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
 entry:
   %stack = alloca [4 x i32], align 4, addrspace(5)
-  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
-  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
-  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
-  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
-  store i32 9, i32 addrspace(5)* %gep0
-  store i32 10, i32 addrspace(5)* %gep1
-  store i32 99, i32 addrspace(5)* %gep2
-  store i32 43, i32 addrspace(5)* %gep3
-  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
-  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
-  store i32 %load, i32 addrspace(1)* %out
-  %gep_dyn_lds =  getelementptr inbounds [0 x i32], [0 x i32]* addrspacecast ([0 x i32] addrspace(3)* @some_dynamic_lds to [0 x i32]*), i64 0, i64 0
-  store i32 1234, i32* %gep_dyn_lds, align 4
+  %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+  %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
+  %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
+  store i32 9, ptr addrspace(5) %stack
+  store i32 10, ptr addrspace(5) %gep1
+  store i32 99, ptr addrspace(5) %gep2
+  store i32 43, ptr addrspace(5) %gep3
+  %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
+  %load = load i32, ptr addrspace(5) %arrayidx, align 4
+  store i32 %load, ptr addrspace(1) %out
+  store i32 1234, ptr addrspacecast (ptr addrspace(3) @some_dynamic_lds to ptr), align 4
   ret void
 }

 ; @callee is only declared in this file. The multi_level tests pass it a
 ; pointer built from nested constant expressions over an LDS global.
-declare void @callee(i8*)
+declare void @callee(ptr)

 ; Same stack-array pattern as the earlier tests. The LDS use is the call
 ; argument, an addrspacecast of a constant GEP to element 8 of @all_lds.
 ; The alloca is expected to remain in the output IR, and the fixed group
 ; segment size is expected to be exactly 65536.
 ; IR-LABEL: @constant_expression_uses_all_lds_multi_level(
 ; IR: alloca

 ; ASM-LABEL: {{^}}constant_expression_uses_all_lds_multi_level:
 ; ASM: .amdhsa_group_segment_fixed_size 65536{{$}}
-define amdgpu_kernel void @constant_expression_uses_all_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+define amdgpu_kernel void @constant_expression_uses_all_lds_multi_level(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
 entry:
   %stack = alloca [4 x i32], align 4, addrspace(5)
-  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
-  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
-  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
-  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
-  store i32 9, i32 addrspace(5)* %gep0
-  store i32 10, i32 addrspace(5)* %gep1
-  store i32 99, i32 addrspace(5)* %gep2
-  store i32 43, i32 addrspace(5)* %gep3
-  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
-  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
-  store i32 %load, i32 addrspace(1)* %out
-  call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([16384 x i32], [16384 x i32] addrspace(3)* @all_lds, i32 0, i32 8) to i8 addrspace(3)*) to i8*))
+  %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+  %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
+  %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
+  store i32 9, ptr addrspace(5) %stack
+  store i32 10, ptr addrspace(5) %gep1
+  store i32 99, ptr addrspace(5) %gep2
+  store i32 43, ptr addrspace(5) %gep3
+  %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
+  %load = load i32, ptr addrspace(5) %arrayidx, align 4
+  store i32 %load, ptr addrspace(1) %out
+  call void @callee(ptr addrspacecast (ptr addrspace(3) getelementptr inbounds ([16384 x i32], ptr addrspace(3) @all_lds, i32 0, i32 8) to ptr))
   ret void
 }

@ -121,21 +116,20 @@ entry:

 ; Same stack-array pattern. The call argument is an addrspacecast of a
 ; constant GEP to element 8 of @some_lds ([32 x i32]). Expected fixed group
 ; segment size is exactly 4224.
 ; ASM-LABEL: {{^}}constant_expression_uses_some_lds_multi_level:
 ; ASM: .amdhsa_group_segment_fixed_size 4224{{$}}
-define amdgpu_kernel void @constant_expression_uses_some_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+define amdgpu_kernel void @constant_expression_uses_some_lds_multi_level(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
 entry:
   %stack = alloca [4 x i32], align 4, addrspace(5)
-  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
-  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
-  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
-  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
-  store i32 9, i32 addrspace(5)* %gep0
-  store i32 10, i32 addrspace(5)* %gep1
-  store i32 99, i32 addrspace(5)* %gep2
-  store i32 43, i32 addrspace(5)* %gep3
-  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
-  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
-  store i32 %load, i32 addrspace(1)* %out
-  call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(3)* @some_lds, i32 0, i32 8) to i8 addrspace(3)*) to i8*))
+  %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+  %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
+  %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
+  store i32 9, ptr addrspace(5) %stack
+  store i32 10, ptr addrspace(5) %gep1
+  store i32 99, ptr addrspace(5) %gep2
+  store i32 43, ptr addrspace(5) %gep3
+  %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
+  %load = load i32, ptr addrspace(5) %arrayidx, align 4
+  store i32 %load, ptr addrspace(1) %out
+  call void @callee(ptr addrspacecast (ptr addrspace(3) getelementptr inbounds ([32 x i32], ptr addrspace(3) @some_lds, i32 0, i32 8) to ptr))
   ret void
 }

@ -144,21 +138,20 @@ entry:

 ; Same stack-array pattern. The call argument references @some_dynamic_lds.
 ; The old form wrapped it in a zero-index GEP and a bitcast, while the
 ; updated form is a bare addrspacecast of the global. Expected fixed group
 ; segment size is exactly 0.
 ; ASM-LABEL: {{^}}constant_expression_uses_some_dynamic_lds_multi_level:
 ; ASM: .amdhsa_group_segment_fixed_size 0{{$}}
-define amdgpu_kernel void @constant_expression_uses_some_dynamic_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+define amdgpu_kernel void @constant_expression_uses_some_dynamic_lds_multi_level(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
 entry:
   %stack = alloca [4 x i32], align 4, addrspace(5)
-  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
-  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
-  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
-  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
-  store i32 9, i32 addrspace(5)* %gep0
-  store i32 10, i32 addrspace(5)* %gep1
-  store i32 99, i32 addrspace(5)* %gep2
-  store i32 43, i32 addrspace(5)* %gep3
-  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
-  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
-  store i32 %load, i32 addrspace(1)* %out
-  call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([0 x i32], [0 x i32] addrspace(3)* @some_dynamic_lds, i32 0, i32 0) to i8 addrspace(3)*) to i8*))
+  %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+  %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
+  %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
+  store i32 9, ptr addrspace(5) %stack
+  store i32 10, ptr addrspace(5) %gep1
+  store i32 99, ptr addrspace(5) %gep2
+  store i32 43, ptr addrspace(5) %gep3
+  %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
+  %load = load i32, ptr addrspace(5) %arrayidx, align 4
+  store i32 %load, ptr addrspace(1) %out
+  call void @callee(ptr addrspacecast (ptr addrspace(3) @some_dynamic_lds to ptr))
   ret void
 }

@ -168,22 +161,21 @@ entry:

 ; Same stack-array pattern. The kernel does not name an LDS global itself.
 ; It stores ptrtoint(@initializer_user_some), an addrspace(1) global whose
 ; initializer refers to @some_lds. Expected fixed group segment size is
 ; exactly 4096.
 ; ASM-LABEL: {{^}}constant_expression_uses_some_lds_global_initializer:
 ; ASM: .amdhsa_group_segment_fixed_size 4096{{$}}
-define amdgpu_kernel void @constant_expression_uses_some_lds_global_initializer(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+define amdgpu_kernel void @constant_expression_uses_some_lds_global_initializer(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
 entry:
   %stack = alloca [4 x i32], align 4, addrspace(5)
-  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
-  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
-  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
-  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
-  store i32 9, i32 addrspace(5)* %gep0
-  store i32 10, i32 addrspace(5)* %gep1
-  store i32 99, i32 addrspace(5)* %gep2
-  store i32 43, i32 addrspace(5)* %gep3
-  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
-  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
-  store i32 %load, i32 addrspace(1)* %out
+  %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+  %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
+  %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
+  store i32 9, ptr addrspace(5) %stack
+  store i32 10, ptr addrspace(5) %gep1
+  store i32 99, ptr addrspace(5) %gep2
+  store i32 43, ptr addrspace(5) %gep3
+  %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
+  %load = load i32, ptr addrspace(5) %arrayidx, align 4
+  store i32 %load, ptr addrspace(1) %out

-  store volatile i32 ptrtoint (i32 addrspace(1)* @initializer_user_some to i32), i32 addrspace(1)* undef
+  store volatile i32 ptrtoint (ptr addrspace(1) @initializer_user_some to i32), ptr addrspace(1) undef
   ret void
 }

@ -195,21 +187,20 @@ entry:

 ; Same stack-array pattern. The kernel stores
 ; ptrtoint(@initializer_user_all), an addrspace(1) global whose initializer
 ; refers to @all_lds.
 ; NOTE(review) - unlike the sibling tests, the size check below matches the
 ; key ".group_segment_fixed_size" followed by a colon rather than the
 ; ".amdhsa_group_segment_fixed_size" directive, and has no end-of-line
 ; anchor. Confirm this is intentional.
 ; ASM-LABEL: {{^}}constant_expression_uses_all_lds_global_initializer:
 ; ASM: .group_segment_fixed_size: 65536
-define amdgpu_kernel void @constant_expression_uses_all_lds_global_initializer(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+define amdgpu_kernel void @constant_expression_uses_all_lds_global_initializer(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
 entry:
   %stack = alloca [4 x i32], align 4, addrspace(5)
-  %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
-  %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
-  %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
-  %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
-  store i32 9, i32 addrspace(5)* %gep0
-  store i32 10, i32 addrspace(5)* %gep1
-  store i32 99, i32 addrspace(5)* %gep2
-  store i32 43, i32 addrspace(5)* %gep3
-  %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
-  %load = load i32, i32 addrspace(5)* %arrayidx, align 4
-  store i32 %load, i32 addrspace(1)* %out
-  store volatile i32 ptrtoint (i32 addrspace(1)* @initializer_user_all to i32), i32 addrspace(1)* undef
+  %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+  %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
+  %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
+  store i32 9, ptr addrspace(5) %stack
+  store i32 10, ptr addrspace(5) %gep1
+  store i32 99, ptr addrspace(5) %gep2
+  store i32 43, ptr addrspace(5) %gep3
+  %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
+  %load = load i32, ptr addrspace(5) %arrayidx, align 4
+  store i32 %load, ptr addrspace(1) %out
+  store volatile i32 ptrtoint (ptr addrspace(1) @initializer_user_all to i32), ptr addrspace(1) undef
   ret void
 }

--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
@ -2,86 +2,86 @@


 ; Both branches GEP into the same [64 x i32] alloca, and the two pointers are
 ; merged by a phi that is then stored through. The expected output replaces
 ; the alloca with the addrspace(3) global @branch_ptr_var_same_alloca.alloca
 ; and retypes the phi and store to addrspace(3). The updated checks capture
 ; the outer GEP as [[GEP]] and require both branch GEPs to use it, replacing
 ; the hard-coded %15 of the old form.
 ; CHECK-LABEL: @branch_ptr_var_same_alloca(
-; CHECK: getelementptr inbounds [256 x [64 x i32]], [256 x [64 x i32]] addrspace(3)* @branch_ptr_var_same_alloca.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: [[GEP:%[0-9]+]] = getelementptr inbounds [256 x [64 x i32]], ptr addrspace(3) @branch_ptr_var_same_alloca.alloca, i32 0, i32 %{{[0-9]+}}

 ; CHECK: if:
-; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %a
+; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(3) [[GEP]], i32 0, i32 %a

 ; CHECK: else:
-; CHECK: %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %15, i32 0, i32 %b
+; CHECK: %arrayidx1 = getelementptr inbounds [64 x i32], ptr addrspace(3) [[GEP]], i32 0, i32 %b

 ; CHECK: endif:
-; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
-; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4
+; CHECK: %phi.ptr = phi ptr addrspace(3) [ %arrayidx0, %if ], [ %arrayidx1, %else ]
+; CHECK: store i32 0, ptr addrspace(3) %phi.ptr, align 4
 define amdgpu_kernel void @branch_ptr_var_same_alloca(i32 %a, i32 %b) #0 {
 entry:
   %alloca = alloca [64 x i32], align 4, addrspace(5)
   br i1 undef, label %if, label %else

 if:
-  %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a
+  %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
   br label %endif

 else:
-  %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %b
+  %arrayidx1 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %b
   br label %endif

 endif:
-  %phi.ptr = phi i32 addrspace(5)* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
-  store i32 0, i32 addrspace(5)* %phi.ptr, align 4
+  %phi.ptr = phi ptr addrspace(5) [ %arrayidx0, %if ], [ %arrayidx1, %else ]
+  store i32 0, ptr addrspace(5) %phi.ptr, align 4
   ret void
 }

 ; The phi merges a GEP into the alloca (from %if) with a null pointer (from
 ; %entry). The expected phi is retyped to addrspace(3) with the null
 ; incoming value kept as the second operand.
 ; CHECK-LABEL: @branch_ptr_phi_alloca_null_0(
-; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ null, %entry ]
+; CHECK: %phi.ptr = phi ptr addrspace(3) [ %arrayidx0, %if ], [ null, %entry ]
 define amdgpu_kernel void @branch_ptr_phi_alloca_null_0(i32 %a, i32 %b) #0 {
 entry:
   %alloca = alloca [64 x i32], align 4, addrspace(5)
   br i1 undef, label %if, label %endif

 if:
-  %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a
+  %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
   br label %endif

 endif:
-  %phi.ptr = phi i32 addrspace(5)* [ %arrayidx0, %if ], [ null, %entry ]
-  store i32 0, i32 addrspace(5)* %phi.ptr, align 4
+  %phi.ptr = phi ptr addrspace(5) [ %arrayidx0, %if ], [ null, %entry ]
+  store i32 0, ptr addrspace(5) %phi.ptr, align 4
   ret void
 }

 ; Mirror of branch_ptr_phi_alloca_null_0 with the phi operands swapped, so
 ; the null incoming value comes first and the alloca GEP second.
 ; CHECK-LABEL: @branch_ptr_phi_alloca_null_1(
-; CHECK: %phi.ptr = phi i32 addrspace(3)*  [ null, %entry ], [ %arrayidx0, %if ]
+; CHECK: %phi.ptr = phi ptr addrspace(3)  [ null, %entry ], [ %arrayidx0, %if ]
 define amdgpu_kernel void @branch_ptr_phi_alloca_null_1(i32 %a, i32 %b) #0 {
 entry:
   %alloca = alloca [64 x i32], align 4, addrspace(5)
   br i1 undef, label %if, label %endif

 if:
-  %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a
+  %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
   br label %endif

 endif:
-  %phi.ptr = phi i32 addrspace(5)* [ null, %entry ], [ %arrayidx0, %if ]
-  store i32 0, i32 addrspace(5)* %phi.ptr, align 4
+  %phi.ptr = phi ptr addrspace(5) [ null, %entry ], [ %arrayidx0, %if ]
+  store i32 0, ptr addrspace(5) %phi.ptr, align 4
   ret void
 }

 ; A phi with a single incoming value (the alloca GEP from %entry). The
 ; expected output bases %arrayidx0 on a GEP into the addrspace(3) global
 ; @one_phi_value.alloca. The updated checks capture that GEP as [[GEP0]]
 ; and match its index with a regex, replacing the hard-coded %14.
 ; CHECK-LABEL: @one_phi_value(
-; CHECK: getelementptr inbounds [256 x [64 x i32]], [256 x [64 x i32]] addrspace(3)* @one_phi_value.alloca, i32 0, i32 %14
-; CHECK:  %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %a
+; CHECK: [[GEP0:%[0-9]+]] = getelementptr inbounds [256 x [64 x i32]], ptr addrspace(3) @one_phi_value.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK:  %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(3) [[GEP0]], i32 0, i32 %a

 ; CHECK: br label %exit
-; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %entry ]
-; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4
+; CHECK: %phi.ptr = phi ptr addrspace(3) [ %arrayidx0, %entry ]
+; CHECK: store i32 0, ptr addrspace(3) %phi.ptr, align 4
 define amdgpu_kernel void @one_phi_value(i32 %a) #0 {
 entry:
   %alloca = alloca [64 x i32], align 4, addrspace(5)
-  %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a
+  %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
   br label %exit

 exit:
-  %phi.ptr = phi i32 addrspace(5)* [ %arrayidx0, %entry ]
-  store i32 0, i32 addrspace(5)* %phi.ptr, align 4
+  %phi.ptr = phi ptr addrspace(5) [ %arrayidx0, %entry ]
+  store i32 0, ptr addrspace(5) %phi.ptr, align 4
   ret void
 }

@ -89,30 +89,30 @@ exit:
 ; One phi input is a GEP into the alloca, the other is the result of a call
 ; to @get_unknown_pointer. The expected output keeps the alloca and leaves
 ; every pointer in addrspace(5).
 ; CHECK: %alloca = alloca [64 x i32], align 4

 ; CHECK: if:
-; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a
+; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a

 ; CHECK: else:
-; CHECK: %arrayidx1 = call i32 addrspace(5)* @get_unknown_pointer()
+; CHECK: %arrayidx1 = call ptr addrspace(5) @get_unknown_pointer()

 ; CHECK: endif:
-; CHECK: %phi.ptr = phi i32 addrspace(5)* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
-; CHECK: store i32 0, i32 addrspace(5)* %phi.ptr, align 4
+; CHECK: %phi.ptr = phi ptr addrspace(5) [ %arrayidx0, %if ], [ %arrayidx1, %else ]
+; CHECK: store i32 0, ptr addrspace(5) %phi.ptr, align 4
 define amdgpu_kernel void @branch_ptr_alloca_unknown_obj(i32 %a, i32 %b) #0 {
 entry:
   %alloca = alloca [64 x i32], align 4, addrspace(5)
   br i1 undef, label %if, label %else

 if:
-  %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a
+  %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
   br label %endif

 else:
-  %arrayidx1 = call i32 addrspace(5)* @get_unknown_pointer()
+  %arrayidx1 = call ptr addrspace(5) @get_unknown_pointer()
   br label %endif

 endif:
-  %phi.ptr = phi i32 addrspace(5)* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
-  store i32 0, i32 addrspace(5)* %phi.ptr, align 4
+  %phi.ptr = phi ptr addrspace(5) [ %arrayidx0, %if ], [ %arrayidx1, %else ]
+  store i32 0, ptr addrspace(5) %phi.ptr, align 4
   ret void
 }

@ -133,12 +133,12 @@ endif:

 ; The loop walks a pointer through the alloca, starting at element 2 and
 ; exiting when the incremented pointer equals the address of element 48.
 ; Each iteration stores the counter %i.09 through the pointer. The expected
 ; output keeps the alloca and leaves the pointer phi in addrspace(5).
 ; CHECK-LABEL: @ptr_induction_var_same_alloca(
 ; CHECK: %alloca = alloca [64 x i32], align 4
-; CHECK: phi i32 addrspace(5)* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ]
+; CHECK: phi ptr addrspace(5) [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ]
 define amdgpu_kernel void @ptr_induction_var_same_alloca() #0 {
 entry:
   %alloca = alloca [64 x i32], align 4, addrspace(5)
-  %arrayidx = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 2
-  %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 48
+  %arrayidx = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 2
+  %arrayidx1 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 48
   br label %for.body

 for.cond.cleanup:                                 ; preds = %for.body
@ -146,11 +146,11 @@ for.cond.cleanup:                                 ; preds = %for.body

 for.body:                                         ; preds = %for.body, %entry
   %i.09 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
-  %p.08 = phi i32 addrspace(5)* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ]
-  store i32 %i.09, i32 addrspace(5)* %p.08, align 4
-  %incdec.ptr = getelementptr inbounds i32, i32 addrspace(5)* %p.08, i32 1
+  %p.08 = phi ptr addrspace(5) [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ]
+  store i32 %i.09, ptr addrspace(5) %p.08, align 4
+  %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %p.08, i32 1
   %inc = add nuw nsw i32 %i.09, 1
-  %cmp = icmp eq i32 addrspace(5)* %incdec.ptr, %arrayidx1
+  %cmp = icmp eq ptr addrspace(5) %incdec.ptr, %arrayidx1
   br i1 %cmp, label %for.cond.cleanup, label %for.body
 }

@ -170,14 +170,14 @@ for.body:                                         ; preds = %for.body, %entry

 ; CHECK-LABEL: @ptr_induction_var_alloca_unknown(
 ; CHECK: %alloca = alloca [64 x i32], align 4
-; CHECK: %p.08 = phi i32 addrspace(5)* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ]
-; CHECK: %cmp = icmp eq i32 addrspace(5)* %incdec.ptr, %call
+; CHECK: %p.08 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ]
+; CHECK: %cmp = icmp eq ptr addrspace(5) %incdec.ptr, %call
 define amdgpu_kernel void @ptr_induction_var_alloca_unknown() #0 {
 entry:
  %alloca = alloca [64 x i32], align 4, addrspace(5)
-  %arrayidx = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 2
-  %call = tail call i32 addrspace(5)* @get_unknown_pointer() #2
-  %cmp.7 = icmp eq i32 addrspace(5)* %arrayidx, %call
+  %arrayidx = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 2
+  %call = tail call ptr addrspace(5) @get_unknown_pointer() #2
+  %cmp.7 = icmp eq ptr addrspace(5) %arrayidx, %call
  br i1 %cmp.7, label %for.cond.cleanup, label %for.body.preheader

 for.body.preheader:                               ; preds = %entry
@ -191,14 +191,14 @@ for.cond.cleanup:                                 ; preds = %for.cond.cleanup.lo

 for.body:                                         ; preds = %for.body, %for.body.preheader
  %i.09 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
-  %p.08 = phi i32 addrspace(5)* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ]
-  store i32 %i.09, i32 addrspace(5)* %p.08, align 4
-  %incdec.ptr = getelementptr inbounds i32, i32 addrspace(5)* %p.08, i32 1
+  %p.08 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ]
+  store i32 %i.09, ptr addrspace(5) %p.08, align 4
+  %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %p.08, i32 1
  %inc = add nuw nsw i32 %i.09, 1
-  %cmp = icmp eq i32 addrspace(5)* %incdec.ptr, %call
+  %cmp = icmp eq ptr addrspace(5) %incdec.ptr, %call
  br i1 %cmp, label %for.cond.cleanup.loopexit, label %for.body
 }

-declare i32 addrspace(5)* @get_unknown_pointer() #0
+declare ptr addrspace(5) @get_unknown_pointer() #0

 attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll
@ -3,19 +3,18 @@
 ; This is just an arbitrary intrinisic that shouldn't ever need to be
 ; handled to ensure it doesn't crash.

-declare void @llvm.stackrestore(i8*) #2
+declare void @llvm.stackrestore(ptr) #2

 ; CHECK-LABEL: @try_promote_unhandled_intrinsic(
 ; CHECK: alloca
-; CHECK: call void @llvm.stackrestore(i8* %tmp1)
-define amdgpu_kernel void @try_promote_unhandled_intrinsic(i32 addrspace(1)* %arg) #2 {
+; CHECK: call void @llvm.stackrestore(ptr %tmp)
+define amdgpu_kernel void @try_promote_unhandled_intrinsic(ptr addrspace(1) %arg) #2 {
 bb:
  %tmp = alloca i32, align 4
-  %tmp1 = bitcast i32* %tmp to i8*
-  %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
-  %tmp3 = load i32, i32 addrspace(1)* %tmp2
-  store i32 %tmp3, i32* %tmp
-  call void @llvm.stackrestore(i8* %tmp1)
+  %tmp2 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
+  %tmp3 = load i32, ptr addrspace(1) %tmp2
+  store i32 %tmp3, ptr %tmp
+  call void @llvm.stackrestore(ptr %tmp)
  ret void
 }

--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll
@ -11,13 +11,13 @@
 ; GCN: v_cndmask_b32_e32 [[RES:v[0-9]+]], 4.0,
 ; GCN: store_dword v{{.+}}, [[RES]]

-; OPT:  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
-; OPT:  store <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, <4 x float> addrspace(5)* %alloca, align 4
-; OPT:  %0 = load <4 x float>, <4 x float> addrspace(5)* %alloca
+; OPT:  %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+; OPT:  store <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, ptr addrspace(5) %alloca, align 4
+; OPT:  %0 = load <4 x float>, ptr addrspace(5) %alloca
 ; OPT:  %1 = extractelement <4 x float> %0, i32 %sel2
-; OPT:  store float %1, float addrspace(1)* %out, align 4
+; OPT:  store float %1, ptr addrspace(1) %out, align 4

-define amdgpu_kernel void @float4_alloca_store4(float addrspace(1)* %out, float addrspace(3)* %dummy_lds) {
+define amdgpu_kernel void @float4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
  %alloca = alloca <4 x float>, align 16, addrspace(5)
  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
@ -26,10 +26,10 @@ entry:
  %c2 = icmp uge i32 %y, 3
  %sel1 = select i1 %c1, i32 1, i32 2
  %sel2 = select i1 %c2, i32 0, i32 %sel1
-  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
-  store <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <4 x float> addrspace(5)* %alloca, align 4
-  %load = load float, float addrspace(5)* %gep, align 4
-  store float %load, float addrspace(1)* %out, align 4
+  %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  store <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, ptr addrspace(5) %alloca, align 4
+  %load = load float, ptr addrspace(5) %gep, align 4
+  store float %load, ptr addrspace(1) %out, align 4
  ret void
 }

@ -46,14 +46,14 @@ entry:
 ; GCN:     v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
 ; GCN:     store_dwordx4 v{{.+}},

-; OPT: %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
-; OPT: %0 = load <4 x float>, <4 x float> addrspace(5)* %alloca
+; OPT: %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+; OPT: %0 = load <4 x float>, ptr addrspace(5) %alloca
 ; OPT: %1 = insertelement <4 x float> %0, float 1.000000e+00, i32 %sel2
-; OPT: store <4 x float> %1, <4 x float> addrspace(5)* %alloca
-; OPT: %load = load <4 x float>, <4 x float> addrspace(5)* %alloca, align 4
-; OPT:  store <4 x float> %load, <4 x float> addrspace(1)* %out, align 4
+; OPT: store <4 x float> %1, ptr addrspace(5) %alloca
+; OPT: %load = load <4 x float>, ptr addrspace(5) %alloca, align 4
+; OPT:  store <4 x float> %load, ptr addrspace(1) %out, align 4

-define amdgpu_kernel void @float4_alloca_load4(<4 x float> addrspace(1)* %out, float addrspace(3)* %dummy_lds) {
+define amdgpu_kernel void @float4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
  %alloca = alloca <4 x float>, align 16, addrspace(5)
  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
@ -62,10 +62,10 @@ entry:
  %c2 = icmp uge i32 %y, 3
  %sel1 = select i1 %c1, i32 1, i32 2
  %sel2 = select i1 %c2, i32 0, i32 %sel1
-  %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
-  store float 1.0, float addrspace(5)* %gep, align 4
-  %load = load <4 x float>, <4 x float> addrspace(5)* %alloca, align 4
-  store <4 x float> %load, <4 x float> addrspace(1)* %out, align 4
+  %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  store float 1.0, ptr addrspace(5) %gep, align 4
+  %load = load <4 x float>, ptr addrspace(5) %alloca, align 4
+  store <4 x float> %load, ptr addrspace(1) %out, align 4
  ret void
 }

@ -77,13 +77,13 @@ entry:
 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
 ; GCN:     v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]]

-; OPT: %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
-; OPT: store <4 x half> <half 0xH3C00, half 0xH4000, half 0xH4200, half 0xH4400>, <4 x half> addrspace(5)* %alloca, align 2
-; OPT: %0 = load <4 x half>, <4 x half> addrspace(5)* %alloca
+; OPT: %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+; OPT: store <4 x half> <half 0xH3C00, half 0xH4000, half 0xH4200, half 0xH4400>, ptr addrspace(5) %alloca, align 2
+; OPT: %0 = load <4 x half>, ptr addrspace(5) %alloca
 ; OPT: %1 = extractelement <4 x half> %0, i32 %sel2
-; OPT: store half %1, half addrspace(1)* %out, align 2
+; OPT: store half %1, ptr addrspace(1) %out, align 2

-define amdgpu_kernel void @half4_alloca_store4(half addrspace(1)* %out, half addrspace(3)* %dummy_lds) {
+define amdgpu_kernel void @half4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
  %alloca = alloca <4 x half>, align 16, addrspace(5)
  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
@ -92,10 +92,10 @@ entry:
  %c2 = icmp uge i32 %y, 3
  %sel1 = select i1 %c1, i32 1, i32 2
  %sel2 = select i1 %c2, i32 0, i32 %sel1
-  %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
-  store <4 x half> <half 1.0, half 2.0, half 3.0, half 4.0>, <4 x half> addrspace(5)* %alloca, align 2
-  %load = load half, half addrspace(5)* %gep, align 2
-  store half %load, half addrspace(1)* %out, align 2
+  %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  store <4 x half> <half 1.0, half 2.0, half 3.0, half 4.0>, ptr addrspace(5) %alloca, align 2
+  %load = load half, ptr addrspace(5) %gep, align 2
+  store half %load, ptr addrspace(1) %out, align 2
  ret void
 }

@ -105,14 +105,14 @@ entry:
 ; GCN-NOT: buffer_
 ; GCN:     s_mov_b64 s[{{[0-9:]+}}], 0xffff

-; OPT: %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
-; OPT: %0 = load <4 x half>, <4 x half> addrspace(5)* %alloca
+; OPT: %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+; OPT: %0 = load <4 x half>, ptr addrspace(5) %alloca
 ; OPT: %1 = insertelement <4 x half> %0, half 0xH3C00, i32 %sel2
-; OPT: store <4 x half> %1, <4 x half> addrspace(5)* %alloca
-; OPT: %load = load <4 x half>, <4 x half> addrspace(5)* %alloca, align 2
-; OPT: store <4 x half> %load, <4 x half> addrspace(1)* %out, align 2
+; OPT: store <4 x half> %1, ptr addrspace(5) %alloca
+; OPT: %load = load <4 x half>, ptr addrspace(5) %alloca, align 2
+; OPT: store <4 x half> %load, ptr addrspace(1) %out, align 2

-define amdgpu_kernel void @half4_alloca_load4(<4 x half> addrspace(1)* %out, half addrspace(3)* %dummy_lds) {
+define amdgpu_kernel void @half4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
  %alloca = alloca <4 x half>, align 16, addrspace(5)
  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
@ -121,10 +121,10 @@ entry:
  %c2 = icmp uge i32 %y, 3
  %sel1 = select i1 %c1, i32 1, i32 2
  %sel2 = select i1 %c2, i32 0, i32 %sel1
-  %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
-  store half 1.0, half addrspace(5)* %gep, align 4
-  %load = load <4 x half>, <4 x half> addrspace(5)* %alloca, align 2
-  store <4 x half> %load, <4 x half> addrspace(1)* %out, align 2
+  %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  store half 1.0, ptr addrspace(5) %gep, align 4
+  %load = load <4 x half>, ptr addrspace(5) %alloca, align 2
+  store <4 x half> %load, ptr addrspace(1) %out, align 2
  ret void
 }

@ -136,13 +136,13 @@ entry:
 ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x20001
 ; GCN:     v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]]

-; OPT: %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
-; OPT: store <4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> addrspace(5)* %alloca, align 2
-; OPT: %0 = load <4 x i16>, <4 x i16> addrspace(5)* %alloca
+; OPT: %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+; OPT: store <4 x i16> <i16 1, i16 2, i16 3, i16 4>, ptr addrspace(5) %alloca, align 2
+; OPT: %0 = load <4 x i16>, ptr addrspace(5) %alloca
 ; OPT: %1 = extractelement <4 x i16> %0, i32 %sel2
-; OPT: store i16 %1, i16 addrspace(1)* %out, align 2
+; OPT: store i16 %1, ptr addrspace(1) %out, align 2

-define amdgpu_kernel void @short4_alloca_store4(i16 addrspace(1)* %out, i16 addrspace(3)* %dummy_lds) {
+define amdgpu_kernel void @short4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
  %alloca = alloca <4 x i16>, align 16, addrspace(5)
  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
@ -151,10 +151,10 @@ entry:
  %c2 = icmp uge i32 %y, 3
  %sel1 = select i1 %c1, i32 1, i32 2
  %sel2 = select i1 %c2, i32 0, i32 %sel1
-  %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
-  store <4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> addrspace(5)* %alloca, align 2
-  %load = load i16, i16 addrspace(5)* %gep, align 2
-  store i16 %load, i16 addrspace(1)* %out, align 2
+  %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  store <4 x i16> <i16 1, i16 2, i16 3, i16 4>, ptr addrspace(5) %alloca, align 2
+  %load = load i16, ptr addrspace(5) %gep, align 2
+  store i16 %load, ptr addrspace(1) %out, align 2
  ret void
 }

@ -164,14 +164,14 @@ entry:
 ; GCN-NOT: buffer_
 ; GCN:     s_mov_b64 s[{{[0-9:]+}}], 0xffff

-; OPT: %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
-; OPT: %0 = load <4 x i16>, <4 x i16> addrspace(5)* %alloca
+; OPT: %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+; OPT: %0 = load <4 x i16>, ptr addrspace(5) %alloca
 ; OPT: %1 = insertelement <4 x i16> %0, i16 1, i32 %sel2
-; OPT: store <4 x i16> %1, <4 x i16> addrspace(5)* %alloca
-; OPT: %load = load <4 x i16>, <4 x i16> addrspace(5)* %alloca, align 2
-; OPT: store <4 x i16> %load, <4 x i16> addrspace(1)* %out, align 2
+; OPT: store <4 x i16> %1, ptr addrspace(5) %alloca
+; OPT: %load = load <4 x i16>, ptr addrspace(5) %alloca, align 2
+; OPT: store <4 x i16> %load, ptr addrspace(1) %out, align 2

-define amdgpu_kernel void @short4_alloca_load4(<4 x i16> addrspace(1)* %out, i16 addrspace(3)* %dummy_lds) {
+define amdgpu_kernel void @short4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
 entry:
  %alloca = alloca <4 x i16>, align 16, addrspace(5)
  %x = tail call i32 @llvm.amdgcn.workitem.id.x()
@ -180,10 +180,10 @@ entry:
  %c2 = icmp uge i32 %y, 3
  %sel1 = select i1 %c1, i32 1, i32 2
  %sel2 = select i1 %c2, i32 0, i32 %sel1
-  %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
-  store i16 1, i16 addrspace(5)* %gep, align 4
-  %load = load <4 x i16>, <4 x i16> addrspace(5)* %alloca, align 2
-  store <4 x i16> %load, <4 x i16> addrspace(1)* %out, align 2
+  %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+  store i16 1, ptr addrspace(5) %gep, align 4
+  %load = load <4 x i16>, ptr addrspace(5) %alloca, align 2
+  store <4 x i16> %load, ptr addrspace(1) %out, align 2
  ret void
 }

@ -194,14 +194,12 @@ entry:
 ; GCN: v_mov_b32_e32 v1, 0

 ; OPT: %private_iptr = alloca <2 x i32>, align 8, addrspace(5)
-; OPT: %cast = bitcast <2 x i32> addrspace(5)* %private_iptr to i64 addrspace(5)*
-; OPT: %tmp1 = load i64, i64 addrspace(5)* %cast, align 8
+; OPT: %tmp1 = load i64, ptr addrspace(5) %private_iptr, align 8

 define i64 @ptr_alloca_bitcast() {
 entry:
  %private_iptr = alloca <2 x i32>, align 8, addrspace(5)
-  %cast = bitcast <2 x i32> addrspace(5)* %private_iptr to i64 addrspace(5)*
-  %tmp1 = load i64, i64 addrspace(5)* %cast, align 8
+  %tmp1 = load i64, ptr addrspace(5) %private_iptr, align 8
  ret i64 %tmp1
 }

--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-volatile.ll
@ -2,26 +2,26 @@

 ; CHECK-LABEL: @volatile_load(
 ; CHECK: alloca [4 x i32]
-; CHECK: load volatile i32, i32 addrspace(5)*
-define amdgpu_kernel void @volatile_load(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+; CHECK: load volatile i32, ptr addrspace(5)
+define amdgpu_kernel void @volatile_load(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) {
 entry:
  %stack = alloca [4 x i32], align 4, addrspace(5)
-  %tmp = load i32, i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %tmp
-  %load = load volatile i32, i32 addrspace(5)* %arrayidx1
-  store i32 %load, i32 addrspace(1)* %out
+  %tmp = load i32, ptr addrspace(1) %in, align 4
+  %arrayidx1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp
+  %load = load volatile i32, ptr addrspace(5) %arrayidx1
+  store i32 %load, ptr addrspace(1) %out
 ret void
 }

 ; CHECK-LABEL: @volatile_store(
 ; CHECK: alloca [4 x i32]
-; CHECK: store volatile i32 %tmp, i32 addrspace(5)*
-define amdgpu_kernel void @volatile_store(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+; CHECK: store volatile i32 %tmp, ptr addrspace(5)
+define amdgpu_kernel void @volatile_store(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) {
 entry:
  %stack = alloca [4 x i32], align 4, addrspace(5)
-  %tmp = load i32, i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %tmp
-  store volatile i32 %tmp, i32 addrspace(5)* %arrayidx1
+  %tmp = load i32, ptr addrspace(1) %in, align 4
+  %arrayidx1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp
+  store volatile i32 %tmp, ptr addrspace(5) %arrayidx1
 ret void
 }

@ -30,15 +30,15 @@ entry:
 ; CHECK: alloca double
 ; CHECK: load double
 ; CHECK: load volatile double
-define amdgpu_kernel void @volatile_and_non_volatile_load(double addrspace(1)* nocapture %arg, i32 %arg1) #0 {
+define amdgpu_kernel void @volatile_and_non_volatile_load(ptr addrspace(1) nocapture %arg, i32 %arg1) #0 {
 bb:
  %tmp = alloca double, align 8, addrspace(5)
-  store double 0.000000e+00, double addrspace(5)* %tmp, align 8
+  store double 0.000000e+00, ptr addrspace(5) %tmp, align 8

-  %tmp4 = load double, double addrspace(5)* %tmp, align 8
-  %tmp5 = load volatile double, double addrspace(5)* %tmp, align 8
+  %tmp4 = load double, ptr addrspace(5) %tmp, align 8
+  %tmp5 = load volatile double, ptr addrspace(5) %tmp, align 8

-  store double %tmp4, double addrspace(1)* %arg
+  store double %tmp4, ptr addrspace(1) %arg
  ret void
 }

--- a/llvm/test/CodeGen/AMDGPU/skip-promote-alloca-vector-users.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-promote-alloca-vector-users.ll
@ -4,34 +4,34 @@

 ; CHECK-LABEL: @test_insertelement(
 ; CHECK:  %alloca = alloca i16
-; CHECK-NEXT:  insertelement <2 x i16 addrspace(5)*> undef, i16 addrspace(5)* %alloca, i32 0
+; CHECK-NEXT:  insertelement <2 x ptr addrspace(5)> undef, ptr addrspace(5) %alloca, i32 0
 define amdgpu_kernel void @test_insertelement() #0 {
 entry:
  %alloca = alloca i16, align 4, addrspace(5)
-  %in = insertelement <2 x i16 addrspace(5)*> undef, i16 addrspace(5)* %alloca, i32 0
-  store <2 x i16 addrspace(5)*> %in, <2 x i16 addrspace(5)*>* undef, align 4
+  %in = insertelement <2 x ptr addrspace(5)> undef, ptr addrspace(5) %alloca, i32 0
+  store <2 x ptr addrspace(5)> %in, ptr undef, align 4
  ret void
 }

 ; CHECK-LABEL: @test_insertvalue(
 ; CHECK:  %alloca = alloca i16
-; CHECK-NEXT:  insertvalue { i16 addrspace(5)* } undef, i16 addrspace(5)* %alloca, 0
+; CHECK-NEXT:  insertvalue { ptr addrspace(5) } undef, ptr addrspace(5) %alloca, 0
 define amdgpu_kernel void @test_insertvalue() #0 {
 entry:
  %alloca = alloca i16, align 4, addrspace(5)
-  %in = insertvalue { i16 addrspace(5)* } undef, i16 addrspace(5)* %alloca, 0
-  store { i16 addrspace(5)* } %in, { i16 addrspace(5)* }* undef, align 4
+  %in = insertvalue { ptr addrspace(5) } undef, ptr addrspace(5) %alloca, 0
+  store { ptr addrspace(5) } %in, ptr undef, align 4
  ret void
 }

 ; CHECK-LABEL: @test_insertvalue_array(
 ; CHECK:  %alloca = alloca i16
-; CHECK-NEXT:  insertvalue [2 x i16 addrspace(5)*] undef, i16 addrspace(5)* %alloca, 0
+; CHECK-NEXT:  insertvalue [2 x ptr addrspace(5)] undef, ptr addrspace(5) %alloca, 0
 define amdgpu_kernel void @test_insertvalue_array() #0 {
 entry:
  %alloca = alloca i16, align 4, addrspace(5)
-  %in = insertvalue [2 x i16 addrspace(5)*] undef, i16 addrspace(5)* %alloca, 0
-  store [2 x i16 addrspace(5)*] %in, [2 x i16 addrspace(5)*]* undef, align 4
+  %in = insertvalue [2 x ptr addrspace(5)] undef, ptr addrspace(5) %alloca, 0
+  store [2 x ptr addrspace(5)] %in, ptr undef, align 4
  ret void
 }

--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-atomic.ll
@ -5,61 +5,58 @@
 ; OPT-LABEL: @vector_alloca_not_atomic(
 ;
 ; OPT: extractelement <3 x i32> <i32 0, i32 1, i32 2>, i64 %index
-define amdgpu_kernel void @vector_alloca_not_atomic(i32 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @vector_alloca_not_atomic(ptr addrspace(1) %out, i64 %index) {
 entry:
  %alloca = alloca [3 x i32], addrspace(5)
-  %a0 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 0
-  %a1 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 1
-  %a2 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 2
-  store i32 0, i32 addrspace(5)* %a0
-  store i32 1, i32 addrspace(5)* %a1
-  store i32 2, i32 addrspace(5)* %a2
-  %tmp = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i64 0, i64 %index
-  %data = load i32, i32 addrspace(5)* %tmp
-  store i32 %data, i32 addrspace(1)* %out
+  %a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
+  %a2 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 2
+  store i32 0, ptr addrspace(5) %alloca
+  store i32 1, ptr addrspace(5) %a1
+  store i32 2, ptr addrspace(5) %a2
+  %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index
+  %data = load i32, ptr addrspace(5) %tmp
+  store i32 %data, ptr addrspace(1) %out
  ret void
 }

 ; OPT-LABEL: @vector_alloca_atomic_read(
 ;
 ; OPT: alloca [3 x i32]
-; OPT: store i32 0, i32 addrspace(5)*
-; OPT: store i32 1, i32 addrspace(5)*
-; OPT: store i32 2, i32 addrspace(5)*
-; OPT: load atomic i32, i32 addrspace(5)*
-define amdgpu_kernel void @vector_alloca_atomic_read(i32 addrspace(1)* %out, i64 %index) {
+; OPT: store i32 0, ptr addrspace(5)
+; OPT: store i32 1, ptr addrspace(5)
+; OPT: store i32 2, ptr addrspace(5)
+; OPT: load atomic i32, ptr addrspace(5)
+define amdgpu_kernel void @vector_alloca_atomic_read(ptr addrspace(1) %out, i64 %index) {
 entry:
  %alloca = alloca [3 x i32], addrspace(5)
-  %a0 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 0
-  %a1 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 1
-  %a2 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 2
-  store i32 0, i32 addrspace(5)* %a0
-  store i32 1, i32 addrspace(5)* %a1
-  store i32 2, i32 addrspace(5)* %a2
-  %tmp = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i64 0, i64 %index
-  %data = load atomic i32, i32 addrspace(5)* %tmp acquire, align 4
-  store i32 %data, i32 addrspace(1)* %out
+  %a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
+  %a2 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 2
+  store i32 0, ptr addrspace(5) %alloca
+  store i32 1, ptr addrspace(5) %a1
+  store i32 2, ptr addrspace(5) %a2
+  %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index
+  %data = load atomic i32, ptr addrspace(5) %tmp acquire, align 4
+  store i32 %data, ptr addrspace(1) %out
  ret void
 }

 ; OPT-LABEL: @vector_alloca_atomic_write(
 ;
 ; OPT: alloca [3 x i32]
-; OPT: store atomic i32 0, i32 addrspace(5)
-; OPT: store atomic i32 1, i32 addrspace(5)
-; OPT: store atomic i32 2, i32 addrspace(5)
-; OPT: load i32, i32 addrspace(5)*
-define amdgpu_kernel void @vector_alloca_atomic_write(i32 addrspace(1)* %out, i64 %index) {
+; OPT: store atomic i32 0, ptr addrspace(5)
+; OPT: store atomic i32 1, ptr addrspace(5)
+; OPT: store atomic i32 2, ptr addrspace(5)
+; OPT: load i32, ptr addrspace(5)
+define amdgpu_kernel void @vector_alloca_atomic_write(ptr addrspace(1) %out, i64 %index) {
 entry:
  %alloca = alloca [3 x i32], addrspace(5)
-  %a0 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 0
-  %a1 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 1
-  %a2 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 2
-  store atomic i32 0, i32 addrspace(5)* %a0 release, align 4
-  store atomic i32 1, i32 addrspace(5)* %a1 release, align 4
-  store atomic i32 2, i32 addrspace(5)* %a2  release, align 4
-  %tmp = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i64 0, i64 %index
-  %data = load i32, i32 addrspace(5)* %tmp
-  store i32 %data, i32 addrspace(1)* %out
+  %a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
+  %a2 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 2
+  store atomic i32 0, ptr addrspace(5) %alloca release, align 4
+  store atomic i32 1, ptr addrspace(5) %a1 release, align 4
+  store atomic i32 2, ptr addrspace(5) %a2  release, align 4
+  %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index
+  %data = load i32, ptr addrspace(5) %tmp
+  store i32 %data, ptr addrspace(1) %out
  ret void
 }
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll
@ -7,7 +7,7 @@ target datalayout = "A5"
 ; OPT-LABEL: @vector_read_alloca_bitcast(
 ; OPT-NOT:   alloca
 ; OPT:       %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
-; OPT-NEXT:  store i32 %0, i32 addrspace(1)* %out, align 4
+; OPT-NEXT:  store i32 %0, ptr addrspace(1) %out, align 4

 ; GCN-LABEL: {{^}}vector_read_alloca_bitcast:
 ; GCN-ALLOCA-COUNT-4: buffer_store_dword
@ -24,20 +24,19 @@ target datalayout = "A5"
 ; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND3:v[0-9]+]], 3, [[IND2]], vcc
 ; GCN-PROMOTE: ScratchSize: 0

-define amdgpu_kernel void @vector_read_alloca_bitcast(i32 addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @vector_read_alloca_bitcast(ptr addrspace(1) %out, i32 %index) {
 entry:
  %tmp = alloca [4 x i32], addrspace(5)
-  %x = bitcast [4 x i32] addrspace(5)* %tmp to i32 addrspace(5)*
-  %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
-  %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
-  %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
-  store i32 0, i32 addrspace(5)* %x
-  store i32 1, i32 addrspace(5)* %y
-  store i32 2, i32 addrspace(5)* %z
-  store i32 3, i32 addrspace(5)* %w
-  %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
-  %tmp2 = load i32, i32 addrspace(5)* %tmp1
-  store i32 %tmp2, i32 addrspace(1)* %out
+  %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+  %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
+  %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
+  store i32 0, ptr addrspace(5) %tmp
+  store i32 1, ptr addrspace(5) %y
+  store i32 2, ptr addrspace(5) %z
+  store i32 3, ptr addrspace(5) %w
+  %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
+  %tmp2 = load i32, ptr addrspace(5) %tmp1
+  store i32 %tmp2, ptr addrspace(1) %out
  ret void
 }

@ -45,7 +44,7 @@ entry:
 ; OPT-NOT:   alloca
 ; OPT:       %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index
 ; OPT-NEXT:  %1 = extractelement <4 x i32> %0, i32 %r_index
-; OPT-NEXT:  store i32 %1, i32 addrspace(1)* %out, align 
+; OPT-NEXT:  store i32 %1, ptr addrspace(1) %out, align

 ; GCN-LABEL: {{^}}vector_write_alloca_bitcast:
 ; GCN-ALLOCA-COUNT-5: buffer_store_dword
@ -55,22 +54,21 @@ entry:

 ; GCN-PROMOTE: ScratchSize: 0

-define amdgpu_kernel void @vector_write_alloca_bitcast(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+define amdgpu_kernel void @vector_write_alloca_bitcast(ptr addrspace(1) %out, i32 %w_index, i32 %r_index) {
 entry:
  %tmp = alloca [4 x i32], addrspace(5)
-  %x = bitcast [4 x i32] addrspace(5)* %tmp to i32 addrspace(5)*
-  %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
-  %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
-  %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
-  store i32 0, i32 addrspace(5)* %x
-  store i32 0, i32 addrspace(5)* %y
-  store i32 0, i32 addrspace(5)* %z
-  store i32 0, i32 addrspace(5)* %w
-  %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %w_index
-  store i32 1, i32 addrspace(5)* %tmp1
-  %tmp2 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %r_index
-  %tmp3 = load i32, i32 addrspace(5)* %tmp2
-  store i32 %tmp3, i32 addrspace(1)* %out
+  %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+  %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
+  %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
+  store i32 0, ptr addrspace(5) %tmp
+  store i32 0, ptr addrspace(5) %y
+  store i32 0, ptr addrspace(5) %z
+  store i32 0, ptr addrspace(5) %w
+  %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %w_index
+  store i32 1, ptr addrspace(5) %tmp1
+  %tmp2 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %r_index
+  %tmp3 = load i32, ptr addrspace(5) %tmp2
+  store i32 %tmp3, ptr addrspace(1) %out
  ret void
 }

@ -78,7 +76,7 @@ entry:
 ; OPT-NOT:   alloca
 ; OPT: bb2:
 ; OPT:  %tmp.sroa.0.0 = phi <6 x float> [ undef, %bb ], [ %0, %bb2 ]
-; OPT:  %0 = insertelement <6 x float> %tmp.sroa.0.0, float %tmp73, i32 %tmp10
+; OPT:  %0 = insertelement <6 x float> %tmp.sroa.0.0, float %tmp72, i32 %tmp10
 ; OPT: .preheader:
 ; OPT:  %bc = bitcast <6 x float> %0 to <6 x i32>
 ; OPT:  %1 = extractelement <6 x i32> %bc, i32 %tmp20
@ -106,31 +104,28 @@ entry:

 ; GCN-PROMOTE: ScratchSize: 0

-define amdgpu_kernel void @vector_write_read_bitcast_to_float(float addrspace(1)* %arg) {
+define amdgpu_kernel void @vector_write_read_bitcast_to_float(ptr addrspace(1) %arg) {
 bb:
  %tmp = alloca [6 x float], align 4, addrspace(5)
-  %tmp1 = bitcast [6 x float] addrspace(5)* %tmp to i8 addrspace(5)*
-  call void @llvm.lifetime.start.p5i8(i64 24, i8 addrspace(5)* %tmp1) #2
+  call void @llvm.lifetime.start.p5(i64 24, ptr addrspace(5) %tmp) #2
  br label %bb2

 bb2:                                              ; preds = %bb2, %bb
  %tmp3 = phi i32 [ 0, %bb ], [ %tmp13, %bb2 ]
  %tmp4 = zext i32 %tmp3 to i64
-  %tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp4
-  %tmp6 = bitcast float addrspace(1)* %tmp5 to i32 addrspace(1)*
-  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+  %tmp5 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %tmp4
+  %tmp7 = load i32, ptr addrspace(1) %tmp5, align 4
  %tmp8 = trunc i32 %tmp3 to i16
  %tmp9 = urem i16 %tmp8, 6
  %tmp10 = zext i16 %tmp9 to i32
-  %tmp11 = getelementptr inbounds [6 x float], [6 x float] addrspace(5)* %tmp, i32 0, i32 %tmp10
-  %tmp12 = bitcast float addrspace(5)* %tmp11 to i32 addrspace(5)*
-  store i32 %tmp7, i32 addrspace(5)* %tmp12, align 4
+  %tmp11 = getelementptr inbounds [6 x float], ptr addrspace(5) %tmp, i32 0, i32 %tmp10
+  store i32 %tmp7, ptr addrspace(5) %tmp11, align 4
  %tmp13 = add nuw nsw i32 %tmp3, 1
  %tmp14 = icmp eq i32 %tmp13, 1000
  br i1 %tmp14, label %.preheader, label %bb2

 bb15:                                             ; preds = %.preheader
-  call void @llvm.lifetime.end.p5i8(i64 24, i8 addrspace(5)* %tmp1) #2
+  call void @llvm.lifetime.end.p5(i64 24, ptr addrspace(5) %tmp) #2
  ret void

 .preheader:                                       ; preds = %.preheader, %bb2
@ -139,13 +134,11 @@ bb15:                                             ; preds = %.preheader
  %tmp18 = urem i16 %tmp17, 6
  %tmp19 = sub nuw nsw i16 5, %tmp18
  %tmp20 = zext i16 %tmp19 to i32
-  %tmp21 = getelementptr inbounds [6 x float], [6 x float] addrspace(5)* %tmp, i32 0, i32 %tmp20
-  %tmp22 = bitcast float addrspace(5)* %tmp21 to i32 addrspace(5)*
-  %tmp23 = load i32, i32 addrspace(5)* %tmp22, align 4
+  %tmp21 = getelementptr inbounds [6 x float], ptr addrspace(5) %tmp, i32 0, i32 %tmp20
+  %tmp23 = load i32, ptr addrspace(5) %tmp21, align 4
  %tmp24 = zext i32 %tmp16 to i64
-  %tmp25 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp24
-  %tmp26 = bitcast float addrspace(1)* %tmp25 to i32 addrspace(1)*
-  store i32 %tmp23, i32 addrspace(1)* %tmp26, align 4
+  %tmp25 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %tmp24
+  store i32 %tmp23, ptr addrspace(1) %tmp25, align 4
  %tmp27 = add nuw nsw i32 %tmp16, 1
  %tmp28 = icmp eq i32 %tmp27, 1000
  br i1 %tmp28, label %bb15, label %.preheader
@ -155,7 +148,7 @@ bb15:                                             ; preds = %.preheader
 ; OPT-NOT:   alloca
 ; OPT: bb2:
 ; OPT:  %tmp.sroa.0.0 = phi <6 x double> [ undef, %bb ], [ %0, %bb2 ]
-; OPT:  %0 = insertelement <6 x double> %tmp.sroa.0.0, double %tmp73, i32 %tmp10
+; OPT:  %0 = insertelement <6 x double> %tmp.sroa.0.0, double %tmp72, i32 %tmp10
 ; OPT: .preheader:
 ; OPT:  %bc = bitcast <6 x double> %0 to <6 x i64>
 ; OPT:  %1 = extractelement <6 x i64> %bc, i32 %tmp20
@ -172,31 +165,28 @@ bb15:                                             ; preds = %.preheader

 ; GCN-PROMOTE: ScratchSize: 0

-define amdgpu_kernel void @vector_write_read_bitcast_to_double(double addrspace(1)* %arg) {
+define amdgpu_kernel void @vector_write_read_bitcast_to_double(ptr addrspace(1) %arg) {
 bb:
  %tmp = alloca [6 x double], align 8, addrspace(5)
-  %tmp1 = bitcast [6 x double] addrspace(5)* %tmp to i8 addrspace(5)*
-  call void @llvm.lifetime.start.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2
+  call void @llvm.lifetime.start.p5(i64 48, ptr addrspace(5) %tmp) #2
  br label %bb2

 bb2:                                              ; preds = %bb2, %bb
  %tmp3 = phi i32 [ 0, %bb ], [ %tmp13, %bb2 ]
  %tmp4 = zext i32 %tmp3 to i64
-  %tmp5 = getelementptr inbounds double, double addrspace(1)* %arg, i64 %tmp4
-  %tmp6 = bitcast double addrspace(1)* %tmp5 to i64 addrspace(1)*
-  %tmp7 = load i64, i64 addrspace(1)* %tmp6, align 8
+  %tmp5 = getelementptr inbounds double, ptr addrspace(1) %arg, i64 %tmp4
+  %tmp7 = load i64, ptr addrspace(1) %tmp5, align 8
  %tmp8 = trunc i32 %tmp3 to i16
  %tmp9 = urem i16 %tmp8, 6
  %tmp10 = zext i16 %tmp9 to i32
-  %tmp11 = getelementptr inbounds [6 x double], [6 x double] addrspace(5)* %tmp, i32 0, i32 %tmp10
-  %tmp12 = bitcast double addrspace(5)* %tmp11 to i64 addrspace(5)*
-  store i64 %tmp7, i64 addrspace(5)* %tmp12, align 8
+  %tmp11 = getelementptr inbounds [6 x double], ptr addrspace(5) %tmp, i32 0, i32 %tmp10
+  store i64 %tmp7, ptr addrspace(5) %tmp11, align 8
  %tmp13 = add nuw nsw i32 %tmp3, 1
  %tmp14 = icmp eq i32 %tmp13, 1000
  br i1 %tmp14, label %.preheader, label %bb2

 bb15:                                             ; preds = %.preheader
-  call void @llvm.lifetime.end.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2
+  call void @llvm.lifetime.end.p5(i64 48, ptr addrspace(5) %tmp) #2
  ret void

 .preheader:                                       ; preds = %.preheader, %bb2
@ -205,13 +195,11 @@ bb15:                                             ; preds = %.preheader
  %tmp18 = urem i16 %tmp17, 6
  %tmp19 = sub nuw nsw i16 5, %tmp18
  %tmp20 = zext i16 %tmp19 to i32
-  %tmp21 = getelementptr inbounds [6 x double], [6 x double] addrspace(5)* %tmp, i32 0, i32 %tmp20
-  %tmp22 = bitcast double addrspace(5)* %tmp21 to i64 addrspace(5)*
-  %tmp23 = load i64, i64 addrspace(5)* %tmp22, align 8
+  %tmp21 = getelementptr inbounds [6 x double], ptr addrspace(5) %tmp, i32 0, i32 %tmp20
+  %tmp23 = load i64, ptr addrspace(5) %tmp21, align 8
  %tmp24 = zext i32 %tmp16 to i64
-  %tmp25 = getelementptr inbounds double, double addrspace(1)* %arg, i64 %tmp24
-  %tmp26 = bitcast double addrspace(1)* %tmp25 to i64 addrspace(1)*
-  store i64 %tmp23, i64 addrspace(1)* %tmp26, align 8
+  %tmp25 = getelementptr inbounds double, ptr addrspace(1) %arg, i64 %tmp24
+  store i64 %tmp23, ptr addrspace(1) %tmp25, align 8
  %tmp27 = add nuw nsw i32 %tmp16, 1
  %tmp28 = icmp eq i32 %tmp27, 1000
  br i1 %tmp28, label %bb15, label %.preheader
@ -237,29 +225,28 @@ bb15:                                             ; preds = %.preheader

 ; GCN-PROMOTE: ScratchSize: 0

-define amdgpu_kernel void @vector_write_read_bitcast_to_i64(i64 addrspace(1)* %arg) {
+define amdgpu_kernel void @vector_write_read_bitcast_to_i64(ptr addrspace(1) %arg) {
 bb:
  %tmp = alloca [6 x i64], align 8, addrspace(5)
-  %tmp1 = bitcast [6 x i64] addrspace(5)* %tmp to i8 addrspace(5)*
-  call void @llvm.lifetime.start.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2
+  call void @llvm.lifetime.start.p5(i64 48, ptr addrspace(5) %tmp) #2
  br label %bb2

 bb2:                                              ; preds = %bb2, %bb
  %tmp3 = phi i32 [ 0, %bb ], [ %tmp11, %bb2 ]
  %tmp4 = zext i32 %tmp3 to i64
-  %tmp5 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp4
-  %tmp6 = load i64, i64 addrspace(1)* %tmp5, align 8
+  %tmp5 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 %tmp4
+  %tmp6 = load i64, ptr addrspace(1) %tmp5, align 8
  %tmp7 = trunc i32 %tmp3 to i16
  %tmp8 = urem i16 %tmp7, 6
  %tmp9 = zext i16 %tmp8 to i32
-  %tmp10 = getelementptr inbounds [6 x i64], [6 x i64] addrspace(5)* %tmp, i32 0, i32 %tmp9
-  store i64 %tmp6, i64 addrspace(5)* %tmp10, align 8
+  %tmp10 = getelementptr inbounds [6 x i64], ptr addrspace(5) %tmp, i32 0, i32 %tmp9
+  store i64 %tmp6, ptr addrspace(5) %tmp10, align 8
  %tmp11 = add nuw nsw i32 %tmp3, 1
  %tmp12 = icmp eq i32 %tmp11, 1000
  br i1 %tmp12, label %.preheader, label %bb2

 bb13:                                             ; preds = %.preheader
-  call void @llvm.lifetime.end.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2
+  call void @llvm.lifetime.end.p5(i64 48, ptr addrspace(5) %tmp) #2
  ret void

 .preheader:                                       ; preds = %.preheader, %bb2
@ -268,11 +255,11 @@ bb13:                                             ; preds = %.preheader
  %tmp16 = urem i16 %tmp15, 6
  %tmp17 = sub nuw nsw i16 5, %tmp16
  %tmp18 = zext i16 %tmp17 to i32
-  %tmp19 = getelementptr inbounds [6 x i64], [6 x i64] addrspace(5)* %tmp, i32 0, i32 %tmp18
-  %tmp20 = load i64, i64 addrspace(5)* %tmp19, align 8
+  %tmp19 = getelementptr inbounds [6 x i64], ptr addrspace(5) %tmp, i32 0, i32 %tmp18
+  %tmp20 = load i64, ptr addrspace(5) %tmp19, align 8
  %tmp21 = zext i32 %tmp14 to i64
-  %tmp22 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp21
-  store i64 %tmp20, i64 addrspace(1)* %tmp22, align 8
+  %tmp22 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 %tmp21
+  store i64 %tmp20, ptr addrspace(1) %tmp22, align 8
  %tmp23 = add nuw nsw i32 %tmp14, 1
  %tmp24 = icmp eq i32 %tmp23, 1000
  br i1 %tmp24, label %bb13, label %.preheader
@ -282,27 +269,26 @@ bb13:                                             ; preds = %.preheader

 ; OPT-LABEL: @vector_read_alloca_bitcast_assume(
 ; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
-; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
+; OPT: store i32 %0, ptr addrspace(1) %out, align 4

 ; GCN-LABEL: {{^}}vector_read_alloca_bitcast_assume:
 ; GCN-COUNT-4: buffer_store_dword

-define amdgpu_kernel void @vector_read_alloca_bitcast_assume(i32 addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @vector_read_alloca_bitcast_assume(ptr addrspace(1) %out, i32 %index) {
 entry:
  %tmp = alloca [4 x i32], addrspace(5)
-  %x = bitcast [4 x i32] addrspace(5)* %tmp to i32 addrspace(5)*
-  %cmp = icmp ne i32 addrspace(5)* %x, null
+  %cmp = icmp ne ptr addrspace(5) %tmp, null
  call void @llvm.assume(i1 %cmp)
-  %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
-  %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
-  %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
-  store i32 0, i32 addrspace(5)* %x
-  store i32 1, i32 addrspace(5)* %y
-  store i32 2, i32 addrspace(5)* %z
-  store i32 3, i32 addrspace(5)* %w
-  %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
-  %tmp2 = load i32, i32 addrspace(5)* %tmp1
-  store i32 %tmp2, i32 addrspace(1)* %out
+  %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+  %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
+  %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
+  store i32 0, ptr addrspace(5) %tmp
+  store i32 1, ptr addrspace(5) %y
+  store i32 2, ptr addrspace(5) %z
+  store i32 3, ptr addrspace(5) %w
+  %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
+  %tmp2 = load i32, ptr addrspace(5) %tmp1
+  store i32 %tmp2, ptr addrspace(1) %out
  ret void
 }

@ -310,7 +296,7 @@ entry:
 ; OPT-NOT:   alloca
 ; OPT:       %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
 ; OPT-NEXT:  %add2 = add nuw nsw i32 %0, 1
-; OPT-NEXT:  store i32 %add2, i32 addrspace(1)* %out, align 4
+; OPT-NEXT:  store i32 %add2, ptr addrspace(1) %out, align 4

 ; GCN-LABEL: {{^}}vector_read_alloca_multiuse:
 ; GCN-ALLOCA-COUNT-4: buffer_store_dword
@ -328,31 +314,29 @@ entry:

 ; GCN-PROMOTE: ScratchSize: 0

-define amdgpu_kernel void @vector_read_alloca_multiuse(i32 addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @vector_read_alloca_multiuse(ptr addrspace(1) %out, i32 %index) {
 entry:
  %tmp = alloca [4 x i32], addrspace(5)
-  %b = bitcast [4 x i32] addrspace(5)* %tmp to float addrspace(5)*
-  %x = bitcast float addrspace(5)* %b to i32 addrspace(5)*
-  %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
-  %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
-  %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
-  store i32 0, i32 addrspace(5)* %x
-  store i32 1, i32 addrspace(5)* %y
-  store i32 2, i32 addrspace(5)* %z
-  store i32 3, i32 addrspace(5)* %w
-  %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
-  %tmp2 = load i32, i32 addrspace(5)* %tmp1
-  %tmp3 = load i32, i32 addrspace(5)* %x
-  %tmp4 = load i32, i32 addrspace(5)* %y
+  %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+  %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
+  %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
+  store i32 0, ptr addrspace(5) %tmp
+  store i32 1, ptr addrspace(5) %y
+  store i32 2, ptr addrspace(5) %z
+  store i32 3, ptr addrspace(5) %w
+  %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
+  %tmp2 = load i32, ptr addrspace(5) %tmp1
+  %tmp3 = load i32, ptr addrspace(5) %tmp
+  %tmp4 = load i32, ptr addrspace(5) %y
  %add1 = add i32 %tmp2, %tmp3
  %add2 = add i32 %add1, %tmp4
-  store i32 %add2, i32 addrspace(1)* %out
+  store i32 %add2, ptr addrspace(1) %out
  ret void
 }

 ; OPT-LABEL: @bitcast_vector_to_vector(
 ; OPT-NOT:   alloca
-; OPT:       store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(1)* %out, align 16
+; OPT:       store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(1) %out, align 16

 ; GCN-LABEL: {{^}}bitcast_vector_to_vector:
 ; GCN: v_mov_b32_e32 v0, 1
@ -362,19 +346,18 @@ entry:

 ; GCN: ScratchSize: 0

-define amdgpu_kernel void @bitcast_vector_to_vector(<4 x i32> addrspace(1)* %out)  {
+define amdgpu_kernel void @bitcast_vector_to_vector(ptr addrspace(1) %out)  {
 .entry:
  %alloca = alloca <4 x float>, align 16, addrspace(5)
-  %cast = bitcast <4 x float> addrspace(5)* %alloca to <4 x i32> addrspace(5)*
-  store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %cast
-  %load = load <4 x i32>, <4 x i32> addrspace(5)* %cast, align 16
-  store <4 x i32> %load, <4 x i32> addrspace(1)* %out
+  store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %alloca
+  %load = load <4 x i32>, ptr addrspace(5) %alloca, align 16
+  store <4 x i32> %load, ptr addrspace(1) %out
  ret void
 }

 ; OPT-LABEL: @vector_bitcast_from_alloca_array(
 ; OPT-NOT:   alloca
-; OPT:       store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(1)* %out, align 16
+; OPT:       store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(1) %out, align 16

 ; GCN-LABEL: {{^}}vector_bitcast_from_alloca_array:
 ; GCN: v_mov_b32_e32 v0, 1
@ -384,26 +367,24 @@ define amdgpu_kernel void @bitcast_vector_to_vector(<4 x i32> addrspace(1)* %out

 ; GCN: ScratchSize: 0

-define amdgpu_kernel void @vector_bitcast_from_alloca_array(<4 x i32> addrspace(1)* %out)  {
+define amdgpu_kernel void @vector_bitcast_from_alloca_array(ptr addrspace(1) %out)  {
 .entry:
  %alloca = alloca [4 x float], align 16, addrspace(5)
-  %cast = bitcast [4 x float] addrspace(5)* %alloca to <4 x i32> addrspace(5)*
-  store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %cast
-  %load = load <4 x i32>, <4 x i32> addrspace(5)* %cast, align 16
-  store <4 x i32> %load, <4 x i32> addrspace(1)* %out
+  store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %alloca
+  %load = load <4 x i32>, ptr addrspace(5) %alloca, align 16
+  store <4 x i32> %load, ptr addrspace(1) %out
  ret void
 }

 ; OPT-LABEL: @vector_bitcast_to_array_from_alloca_array(
 ; OPT-NOT:   alloca
-; OPT:      %out.repack = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 0
-; OPT-NEXT: store i32 1, i32 addrspace(1)* %out.repack, align 4
-; OPT-NEXT: %out.repack1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 1
-; OPT-NEXT: store i32 2, i32 addrspace(1)* %out.repack1, align 4
-; OPT-NEXT: %out.repack2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 2
-; OPT-NEXT: store i32 3, i32 addrspace(1)* %out.repack2, align 4
-; OPT-NEXT: %out.repack3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 3
-; OPT-NEXT: store i32 4, i32 addrspace(1)* %out.repack3, align 4
+; OPT-NEXT: store i32 1, ptr addrspace(1) %out, align 4
+; OPT-NEXT: %out.repack1 = getelementptr inbounds [4 x i32], ptr addrspace(1) %out, i64 0, i64 1
+; OPT-NEXT: store i32 2, ptr addrspace(1) %out.repack1, align 4
+; OPT-NEXT: %out.repack2 = getelementptr inbounds [4 x i32], ptr addrspace(1) %out, i64 0, i64 2
+; OPT-NEXT: store i32 3, ptr addrspace(1) %out.repack2, align 4
+; OPT-NEXT: %out.repack3 = getelementptr inbounds [4 x i32], ptr addrspace(1) %out, i64 0, i64 3
+; OPT-NEXT: store i32 4, ptr addrspace(1) %out.repack3, align 4

 ; GCN-LABEL: {{^}}vector_bitcast_to_array_from_alloca_array:
 ; GCN: v_mov_b32_e32 v0, 1
@ -413,26 +394,23 @@ define amdgpu_kernel void @vector_bitcast_from_alloca_array(<4 x i32> addrspace(

 ; GCN: ScratchSize: 0

-define amdgpu_kernel void @vector_bitcast_to_array_from_alloca_array([4 x i32] addrspace(1)* %out)  {
-.entry:
+define amdgpu_kernel void @vector_bitcast_to_array_from_alloca_array(ptr addrspace(1) %out)  {
  %alloca = alloca [4 x float], align 16, addrspace(5)
-  %cast = bitcast [4 x float] addrspace(5)* %alloca to [4 x i32] addrspace(5)*
-  store [4 x i32] [i32 1, i32 2, i32 3, i32 4], [4 x i32] addrspace(5)* %cast
-  %load = load [4 x i32], [4 x i32] addrspace(5)* %cast, align 16
-  store [4 x i32] %load, [4 x i32] addrspace(1)* %out
+  store [4 x i32] [i32 1, i32 2, i32 3, i32 4], ptr addrspace(5) %alloca
+  %load = load [4 x i32], ptr addrspace(5) %alloca, align 16
+  store [4 x i32] %load, ptr addrspace(1) %out
  ret void
 }

 ; OPT-LABEL: @vector_bitcast_to_struct_from_alloca_array(
 ; OPT-NOT:   alloca
-; OPT:      %out.repack = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 0
-; OPT-NEXT: store i32 1, i32 addrspace(1)* %out.repack, align 4
-; OPT-NEXT: %out.repack1 = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 1
-; OPT-NEXT: store i32 2, i32 addrspace(1)* %out.repack1, align 4
-; OPT-NEXT: %out.repack2 = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 2
-; OPT-NEXT: store i32 3, i32 addrspace(1)* %out.repack2, align 4
-; OPT-NEXT: %out.repack3 = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 3
-; OPT-NEXT: store i32 4, i32 addrspace(1)* %out.repack3, align 4
+; OPT-NEXT: store i32 1, ptr addrspace(1) %out, align 4
+; OPT-NEXT: %out.repack1 = getelementptr inbounds %struct.v4, ptr addrspace(1) %out, i64 0, i32 1
+; OPT-NEXT: store i32 2, ptr addrspace(1) %out.repack1, align 4
+; OPT-NEXT: %out.repack2 = getelementptr inbounds %struct.v4, ptr addrspace(1) %out, i64 0, i32 2
+; OPT-NEXT: store i32 3, ptr addrspace(1) %out.repack2, align 4
+; OPT-NEXT: %out.repack3 = getelementptr inbounds %struct.v4, ptr addrspace(1) %out, i64 0, i32 3
+; OPT-NEXT: store i32 4, ptr addrspace(1) %out.repack3, align 4

 ; GCN-LABEL: {{^}}vector_bitcast_to_struct_from_alloca_array:
 ; GCN: v_mov_b32_e32 v0, 1
@ -444,18 +422,16 @@ define amdgpu_kernel void @vector_bitcast_to_array_from_alloca_array([4 x i32] a

 %struct.v4 = type { i32, i32, i32, i32 }

-define amdgpu_kernel void @vector_bitcast_to_struct_from_alloca_array(%struct.v4 addrspace(1)* %out)  {
-.entry:
+define amdgpu_kernel void @vector_bitcast_to_struct_from_alloca_array(ptr addrspace(1) %out)  {
  %alloca = alloca [4 x float], align 16, addrspace(5)
-  %cast = bitcast [4 x float] addrspace(5)* %alloca to %struct.v4 addrspace(5)*
-  store %struct.v4 { i32 1, i32 2, i32 3, i32 4 }, %struct.v4 addrspace(5)* %cast
-  %load = load %struct.v4, %struct.v4 addrspace(5)* %cast, align 16
-  store %struct.v4 %load, %struct.v4 addrspace(1)* %out
+  store %struct.v4 { i32 1, i32 2, i32 3, i32 4 }, ptr addrspace(5) %alloca
+  %load = load %struct.v4, ptr addrspace(5) %alloca, align 16
+  store %struct.v4 %load, ptr addrspace(1) %out
  ret void
 }

-declare void @llvm.lifetime.start.p5i8(i64 immarg, i8 addrspace(5)* nocapture)
+declare void @llvm.lifetime.start.p5(i64 immarg, ptr addrspace(5) nocapture)

-declare void @llvm.lifetime.end.p5i8(i64 immarg, i8 addrspace(5)* nocapture)
+declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) nocapture)

 declare void @llvm.assume(i1)
--- a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll
@ -8,14 +8,13 @@ target datalayout = "A5"
 ; OPT: <8 x i64>
 ; LIMIT32: alloca
 ; LIMIT32-NOT: <8 x i64>
-define amdgpu_kernel void @alloca_8xi64_max1024(i64 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @alloca_8xi64_max1024(ptr addrspace(1) %out, i32 %index) #0 {
 entry:
  %tmp = alloca [8 x i64], addrspace(5)
-  %x = getelementptr [8 x i64], [8 x i64] addrspace(5)* %tmp, i32 0, i32 0
-  store i64 0, i64 addrspace(5)* %x
-  %tmp1 = getelementptr [8 x i64], [8 x i64] addrspace(5)* %tmp, i32 0, i32 %index
-  %tmp2 = load i64, i64 addrspace(5)* %tmp1
-  store i64 %tmp2, i64 addrspace(1)* %out
+  store i64 0, ptr addrspace(5) %tmp
+  %tmp1 = getelementptr [8 x i64], ptr addrspace(5) %tmp, i32 0, i32 %index
+  %tmp2 = load i64, ptr addrspace(5) %tmp1
+  store i64 %tmp2, ptr addrspace(1) %out
  ret void
 }

@ -24,14 +23,13 @@ entry:
 ; OPT-NOT: <9 x i64>
 ; LIMIT32: alloca
 ; LIMIT32-NOT: <9 x i64>
-define amdgpu_kernel void @alloca_9xi64_max1024(i64 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @alloca_9xi64_max1024(ptr addrspace(1) %out, i32 %index) #0 {
 entry:
  %tmp = alloca [9 x i64], addrspace(5)
-  %x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0
-  store i64 0, i64 addrspace(5)* %x
-  %tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index
-  %tmp2 = load i64, i64 addrspace(5)* %tmp1
-  store i64 %tmp2, i64 addrspace(1)* %out
+  store i64 0, ptr addrspace(5) %tmp
+  %tmp1 = getelementptr [9 x i64], ptr addrspace(5) %tmp, i32 0, i32 %index
+  %tmp2 = load i64, ptr addrspace(5) %tmp1
+  store i64 %tmp2, ptr addrspace(1) %out
  ret void
 }

@ -40,14 +38,13 @@ entry:
 ; OPT: <16 x i64>
 ; LIMIT32: alloca
 ; LIMIT32-NOT: <16 x i64>
-define amdgpu_kernel void @alloca_16xi64_max512(i64 addrspace(1)* %out, i32 %index) #1 {
+define amdgpu_kernel void @alloca_16xi64_max512(ptr addrspace(1) %out, i32 %index) #1 {
 entry:
  %tmp = alloca [16 x i64], addrspace(5)
-  %x = getelementptr [16 x i64], [16 x i64] addrspace(5)* %tmp, i32 0, i32 0
-  store i64 0, i64 addrspace(5)* %x
-  %tmp1 = getelementptr [16 x i64], [16 x i64] addrspace(5)* %tmp, i32 0, i32 %index
-  %tmp2 = load i64, i64 addrspace(5)* %tmp1
-  store i64 %tmp2, i64 addrspace(1)* %out
+  store i64 0, ptr addrspace(5) %tmp
+  %tmp1 = getelementptr [16 x i64], ptr addrspace(5) %tmp, i32 0, i32 %index
+  %tmp2 = load i64, ptr addrspace(5) %tmp1
+  store i64 %tmp2, ptr addrspace(1) %out
  ret void
 }

@ -56,14 +53,13 @@ entry:
 ; OPT-NOT: <17 x i64>
 ; LIMIT32: alloca
 ; LIMIT32-NOT: <17 x i64>
-define amdgpu_kernel void @alloca_17xi64_max512(i64 addrspace(1)* %out, i32 %index) #1 {
+define amdgpu_kernel void @alloca_17xi64_max512(ptr addrspace(1) %out, i32 %index) #1 {
 entry:
  %tmp = alloca [17 x i64], addrspace(5)
-  %x = getelementptr [17 x i64], [17 x i64] addrspace(5)* %tmp, i32 0, i32 0
-  store i64 0, i64 addrspace(5)* %x
-  %tmp1 = getelementptr [17 x i64], [17 x i64] addrspace(5)* %tmp, i32 0, i32 %index
-  %tmp2 = load i64, i64 addrspace(5)* %tmp1
-  store i64 %tmp2, i64 addrspace(1)* %out
+  store i64 0, ptr addrspace(5) %tmp
+  %tmp1 = getelementptr [17 x i64], ptr addrspace(5) %tmp, i32 0, i32 %index
+  %tmp2 = load i64, ptr addrspace(5) %tmp1
+  store i64 %tmp2, ptr addrspace(1) %out
  ret void
 }

@ -72,14 +68,13 @@ entry:
 ; OPT-NOT: <9 x i128>
 ; LIMIT32: alloca
 ; LIMIT32-NOT: <9 x i128>
-define amdgpu_kernel void @alloca_9xi128_max512(i128 addrspace(1)* %out, i32 %index) #1 {
+define amdgpu_kernel void @alloca_9xi128_max512(ptr addrspace(1) %out, i32 %index) #1 {
 entry:
  %tmp = alloca [9 x i128], addrspace(5)
-  %x = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 0
-  store i128 0, i128 addrspace(5)* %x
-  %tmp1 = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 %index
-  %tmp2 = load i128, i128 addrspace(5)* %tmp1
-  store i128 %tmp2, i128 addrspace(1)* %out
+  store i128 0, ptr addrspace(5) %tmp
+  %tmp1 = getelementptr [9 x i128], ptr addrspace(5) %tmp, i32 0, i32 %index
+  %tmp2 = load i128, ptr addrspace(5) %tmp1
+  store i128 %tmp2, ptr addrspace(1) %out
  ret void
 }

@ -88,14 +83,13 @@ entry:
 ; OPT: <9 x i128>
 ; LIMIT32: alloca
 ; LIMIT32-NOT: <9 x i128>
-define amdgpu_kernel void @alloca_9xi128_max256(i128 addrspace(1)* %out, i32 %index) #2 {
+define amdgpu_kernel void @alloca_9xi128_max256(ptr addrspace(1) %out, i32 %index) #2 {
 entry:
  %tmp = alloca [9 x i128], addrspace(5)
-  %x = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 0
-  store i128 0, i128 addrspace(5)* %x
-  %tmp1 = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 %index
-  %tmp2 = load i128, i128 addrspace(5)* %tmp1
-  store i128 %tmp2, i128 addrspace(1)* %out
+  store i128 0, ptr addrspace(5) %tmp
+  %tmp1 = getelementptr [9 x i128], ptr addrspace(5) %tmp, i32 0, i32 %index
+  %tmp2 = load i128, ptr addrspace(5) %tmp1
+  store i128 %tmp2, ptr addrspace(1) %out
  ret void
 }

@ -104,14 +98,13 @@ entry:
 ; OPT: <16 x i128>
 ; LIMIT32: alloca
 ; LIMIT32-NOT: <16 x i128>
-define amdgpu_kernel void @alloca_16xi128_max256(i128 addrspace(1)* %out, i32 %index) #2 {
+define amdgpu_kernel void @alloca_16xi128_max256(ptr addrspace(1) %out, i32 %index) #2 {
 entry:
  %tmp = alloca [16 x i128], addrspace(5)
-  %x = getelementptr [16 x i128], [16 x i128] addrspace(5)* %tmp, i32 0, i32 0
-  store i128 0, i128 addrspace(5)* %x
-  %tmp1 = getelementptr [16 x i128], [16 x i128] addrspace(5)* %tmp, i32 0, i32 %index
-  %tmp2 = load i128, i128 addrspace(5)* %tmp1
-  store i128 %tmp2, i128 addrspace(1)* %out
+  store i128 0, ptr addrspace(5) %tmp
+  %tmp1 = getelementptr [16 x i128], ptr addrspace(5) %tmp, i32 0, i32 %index
+  %tmp2 = load i128, ptr addrspace(5) %tmp1
+  store i128 %tmp2, ptr addrspace(1) %out
  ret void
 }

@ -120,14 +113,13 @@ entry:
 ; OPT-NOT: <9 x i256>
 ; LIMIT32: alloca
 ; LIMIT32-NOT: <9 x i256>
-define amdgpu_kernel void @alloca_9xi256_max256(i256 addrspace(1)* %out, i32 %index) #2 {
+define amdgpu_kernel void @alloca_9xi256_max256(ptr addrspace(1) %out, i32 %index) #2 {
 entry:
  %tmp = alloca [9 x i256], addrspace(5)
-  %x = getelementptr [9 x i256], [9 x i256] addrspace(5)* %tmp, i32 0, i32 0
-  store i256 0, i256 addrspace(5)* %x
-  %tmp1 = getelementptr [9 x i256], [9 x i256] addrspace(5)* %tmp, i32 0, i32 %index
-  %tmp2 = load i256, i256 addrspace(5)* %tmp1
-  store i256 %tmp2, i256 addrspace(1)* %out
+  store i256 0, ptr addrspace(5) %tmp
+  %tmp1 = getelementptr [9 x i256], ptr addrspace(5) %tmp, i32 0, i32 %index
+  %tmp2 = load i256, ptr addrspace(5) %tmp1
+  store i256 %tmp2, ptr addrspace(1) %out
  ret void
 }

@ -136,14 +128,13 @@ entry:
 ; OPT: <9 x i64>
 ; LIMIT32: alloca
 ; LIMIT32-NOT: <9 x i64>
-define amdgpu_kernel void @alloca_9xi64_max256(i64 addrspace(1)* %out, i32 %index) #2 {
+define amdgpu_kernel void @alloca_9xi64_max256(ptr addrspace(1) %out, i32 %index) #2 {
 entry:
  %tmp = alloca [9 x i64], addrspace(5)
-  %x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0
-  store i64 0, i64 addrspace(5)* %x
-  %tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index
-  %tmp2 = load i64, i64 addrspace(5)* %tmp1
-  store i64 %tmp2, i64 addrspace(1)* %out
+  store i64 0, ptr addrspace(5) %tmp
+  %tmp1 = getelementptr [9 x i64], ptr addrspace(5) %tmp, i32 0, i32 %index
+  %tmp2 = load i64, ptr addrspace(5) %tmp1
+  store i64 %tmp2, ptr addrspace(1) %out
  ret void
 }

@ -152,14 +143,13 @@ entry:
 ; OPT-NOT: <9 x i64>
 ; LIMIT32: alloca
 ; LIMIT32-NOT: <9 x i64>
-define void @func_alloca_9xi64_max256(i64 addrspace(1)* %out, i32 %index) #2 {
+define void @func_alloca_9xi64_max256(ptr addrspace(1) %out, i32 %index) #2 {
 entry:
  %tmp = alloca [9 x i64], addrspace(5)
-  %x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0
-  store i64 0, i64 addrspace(5)* %x
-  %tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index
-  %tmp2 = load i64, i64 addrspace(5)* %tmp1
-  store i64 %tmp2, i64 addrspace(1)* %out
+  store i64 0, ptr addrspace(5) %tmp
+  %tmp1 = getelementptr [9 x i64], ptr addrspace(5) %tmp, i32 0, i32 %index
+  %tmp2 = load i64, ptr addrspace(5) %tmp1
+  store i64 %tmp2, ptr addrspace(1) %out
  ret void
 }

--- a/llvm/test/CodeGen/AMDGPU/vector-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-alloca.ll
@ -9,7 +9,7 @@ target datalayout = "A5"

 ; OPT-LABEL: @vector_read(
 ; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
-; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
+; OPT: store i32 %0, ptr addrspace(1) %out, align 4

 ; FUNC-LABEL: {{^}}vector_read:
 ; EG: MOV
@ -17,27 +17,26 @@ target datalayout = "A5"
 ; EG: MOV
 ; EG: MOV
 ; EG: MOVA_INT
-define amdgpu_kernel void @vector_read(i32 addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @vector_read(ptr addrspace(1) %out, i32 %index) {
 entry:
  %tmp = alloca [4 x i32], addrspace(5)
-  %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
-  %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
-  %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
-  %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
-  store i32 0, i32 addrspace(5)* %x
-  store i32 1, i32 addrspace(5)* %y
-  store i32 2, i32 addrspace(5)* %z
-  store i32 3, i32 addrspace(5)* %w
-  %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
-  %tmp2 = load i32, i32 addrspace(5)* %tmp1
-  store i32 %tmp2, i32 addrspace(1)* %out
+  %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+  %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
+  %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
+  store i32 0, ptr addrspace(5) %tmp
+  store i32 1, ptr addrspace(5) %y
+  store i32 2, ptr addrspace(5) %z
+  store i32 3, ptr addrspace(5) %w
+  %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
+  %tmp2 = load i32, ptr addrspace(5) %tmp1
+  store i32 %tmp2, ptr addrspace(1) %out
  ret void
 }

 ; OPT-LABEL: @vector_write(
 ; OPT: %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index
 ; OPT: %1 = extractelement <4 x i32> %0, i32 %r_index
-; OPT: store i32 %1, i32 addrspace(1)* %out, align 4
+; OPT: store i32 %1, ptr addrspace(1) %out, align 4

 ; FUNC-LABEL: {{^}}vector_write:
 ; EG: MOV
@ -46,91 +45,83 @@ entry:
 ; EG: MOV
 ; EG: MOVA_INT
 ; EG: MOVA_INT
-define amdgpu_kernel void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+define amdgpu_kernel void @vector_write(ptr addrspace(1) %out, i32 %w_index, i32 %r_index) {
 entry:
  %tmp = alloca [4 x i32], addrspace(5)
-  %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
-  %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
-  %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
-  %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
-  store i32 0, i32 addrspace(5)* %x
-  store i32 0, i32 addrspace(5)* %y
-  store i32 0, i32 addrspace(5)* %z
-  store i32 0, i32 addrspace(5)* %w
-  %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %w_index
-  store i32 1, i32 addrspace(5)* %tmp1
-  %tmp2 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %r_index
-  %tmp3 = load i32, i32 addrspace(5)* %tmp2
-  store i32 %tmp3, i32 addrspace(1)* %out
+  %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+  %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
+  %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
+  store i32 0, ptr addrspace(5) %tmp
+  store i32 0, ptr addrspace(5) %y
+  store i32 0, ptr addrspace(5) %z
+  store i32 0, ptr addrspace(5) %w
+  %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %w_index
+  store i32 1, ptr addrspace(5) %tmp1
+  %tmp2 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %r_index
+  %tmp3 = load i32, ptr addrspace(5) %tmp2
+  store i32 %tmp3, ptr addrspace(1) %out
  ret void
 }

 ; This test should be optimized to:
-; store i32 0, i32 addrspace(1)* %out
+; store i32 0, ptr addrspace(1) %out

 ; OPT-LABEL: @bitcast_gep(
-; OPT-LABEL: store i32 0, i32 addrspace(1)* %out, align 4
+; OPT-LABEL: store i32 0, ptr addrspace(1) %out, align 4

 ; FUNC-LABEL: {{^}}bitcast_gep:
 ; EG: STORE_RAW
-define amdgpu_kernel void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+define amdgpu_kernel void @bitcast_gep(ptr addrspace(1) %out, i32 %w_index, i32 %r_index) {
 entry:
  %tmp = alloca [4 x i32], addrspace(5)
-  %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
-  %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
-  %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
-  %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
-  store i32 0, i32 addrspace(5)* %x
-  store i32 0, i32 addrspace(5)* %y
-  store i32 0, i32 addrspace(5)* %z
-  store i32 0, i32 addrspace(5)* %w
-  %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
-  %tmp2 = bitcast i32 addrspace(5)* %tmp1 to [4 x i32] addrspace(5)*
-  %tmp3 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp2, i32 0, i32 0
-  %tmp4 = load i32, i32 addrspace(5)* %tmp3
-  store i32 %tmp4, i32 addrspace(1)* %out
+  %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+  %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
+  %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
+  store i32 0, ptr addrspace(5) %tmp
+  store i32 0, ptr addrspace(5) %y
+  store i32 0, ptr addrspace(5) %z
+  store i32 0, ptr addrspace(5) %w
+  %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+  %tmp4 = load i32, ptr addrspace(5) %tmp1
+  store i32 %tmp4, ptr addrspace(1) %out
  ret void
 }

 ; OPT-LABEL: @vector_read_bitcast_gep(
 ; OPT: %0 = extractelement <4 x i32> <i32 1065353216, i32 1, i32 2, i32 3>, i32 %index
-; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
-define amdgpu_kernel void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) {
+; OPT: store i32 %0, ptr addrspace(1) %out, align 4
+define amdgpu_kernel void @vector_read_bitcast_gep(ptr addrspace(1) %out, i32 %index) {
 entry:
  %tmp = alloca [4 x i32], addrspace(5)
-  %x = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
-  %y = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
-  %z = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
-  %w = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
-  %bc = bitcast i32 addrspace(5)* %x to float addrspace(5)*
-  store float 1.0, float addrspace(5)* %bc
-  store i32 1, i32 addrspace(5)* %y
-  store i32 2, i32 addrspace(5)* %z
-  store i32 3, i32 addrspace(5)* %w
-  %tmp1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
-  %tmp2 = load i32, i32 addrspace(5)* %tmp1
-  store i32 %tmp2, i32 addrspace(1)* %out
+  %y = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+  %z = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
+  %w = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
+  store float 1.0, ptr addrspace(5) %tmp
+  store i32 1, ptr addrspace(5) %y
+  store i32 2, ptr addrspace(5) %z
+  store i32 3, ptr addrspace(5) %w
+  %tmp1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
+  %tmp2 = load i32, ptr addrspace(5) %tmp1
+  store i32 %tmp2, ptr addrspace(1) %out
  ret void
 }

 ; OPT-LABEL: @vector_read_bitcast_alloca(
 ; OPT: %0 = extractelement <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, i32 %index
-; OPT: store float %0, float addrspace(1)* %out, align 4
-define amdgpu_kernel void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) {
+; OPT: store float %0, ptr addrspace(1) %out, align 4
+define amdgpu_kernel void @vector_read_bitcast_alloca(ptr addrspace(1) %out, i32 %index) {
 entry:
  %tmp = alloca [4 x i32], addrspace(5)
-  %tmp.bc = bitcast [4 x i32] addrspace(5)* %tmp to [4 x float] addrspace(5)*
-  %x = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 0
-  %y = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 1
-  %z = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 2
-  %w = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 3
-  store float 0.0, float addrspace(5)* %x
-  store float 1.0, float addrspace(5)* %y
-  store float 2.0, float addrspace(5)* %z
-  store float 4.0, float addrspace(5)* %w
-  %tmp1 = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 %index
-  %tmp2 = load float, float addrspace(5)* %tmp1
-  store float %tmp2, float addrspace(1)* %out
+  %y = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 1
+  %z = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 2
+  %w = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 3
+  store float 0.0, ptr addrspace(5) %tmp
+  store float 1.0, ptr addrspace(5) %y
+  store float 2.0, ptr addrspace(5) %z
+  store float 4.0, ptr addrspace(5) %w
+  %tmp1 = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 %index
+  %tmp2 = load float, ptr addrspace(5) %tmp1
+  store float %tmp2, ptr addrspace(1) %out
  ret void
 }

@ -138,20 +129,19 @@ entry:

 ; OPT-LABEL: @vector_read_with_local_arg(
 ; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
-; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
-define amdgpu_kernel void @vector_read_with_local_arg(i32 addrspace(3)* %stopper, i32 addrspace(1)* %out, i32 %index) {
+; OPT: store i32 %0, ptr addrspace(1) %out, align 4
+define amdgpu_kernel void @vector_read_with_local_arg(ptr addrspace(3) %stopper, ptr addrspace(1) %out, i32 %index) {
 entry:
  %tmp = alloca [4 x i32], addrspace(5)
-  %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
-  %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
-  %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
-  %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
-  store i32 0, i32 addrspace(5)* %x
-  store i32 1, i32 addrspace(5)* %y
-  store i32 2, i32 addrspace(5)* %z
-  store i32 3, i32 addrspace(5)* %w
-  %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
-  %tmp2 = load i32, i32 addrspace(5)* %tmp1
-  store i32 %tmp2, i32 addrspace(1)* %out
+  %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+  %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
+  %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
+  store i32 0, ptr addrspace(5) %tmp
+  store i32 1, ptr addrspace(5) %y
+  store i32 2, ptr addrspace(5) %z
+  store i32 3, ptr addrspace(5) %w
+  %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
+  %tmp2 = load i32, ptr addrspace(5) %tmp1
+  store i32 %tmp2, ptr addrspace(1) %out
  ret void
 }