diff --git a/llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll b/llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll index 5a830b942d5d..c18d9941bfd5 100644 --- a/llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll @@ -6,45 +6,43 @@ ; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] poison, align 4 ; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] poison, align 4 -define amdgpu_kernel void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +define amdgpu_kernel void @promote_alloca_size_63(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 { entry: %stack = alloca [5 x i32], align 4, addrspace(5) - %0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0 - store i32 4, i32 addrspace(5)* %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 - %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1 - store i32 5, i32 addrspace(5)* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 - %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 - store i32 %2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 - %3 = load i32, i32 addrspace(5)* %arrayidx12 - %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 - store i32 %3, i32 addrspace(1)* %arrayidx13 + %0 = load i32, ptr addrspace(1) %in, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0 + store i32 4, ptr addrspace(5) %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1 + %1 = load i32, ptr addrspace(1) %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1 + store i32 5, ptr addrspace(5) %arrayidx3, align 4 + %2 = load i32, ptr addrspace(5) %stack, align 4 + store i32 %2, ptr addrspace(1) %out, align 4 + %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1 + %3 = load i32, ptr addrspace(5) %arrayidx12 + %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1 + store i32 %3, ptr addrspace(1) %arrayidx13 ret void } ; ALL: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] poison, align 4 -define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 { +define amdgpu_kernel void @promote_alloca_size_256(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #1 { entry: %stack = alloca [5 x i32], align 4, addrspace(5) - %0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0 - store i32 4, i32 addrspace(5)* %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 - %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1 - store i32 5, i32 addrspace(5)* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 - %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 - store i32 %2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 - %3 = load i32, i32 addrspace(5)* %arrayidx12 - %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 - store i32 %3, i32 addrspace(1)* %arrayidx13 + %0 = load i32, ptr addrspace(1) %in, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0 + store i32 4, ptr addrspace(5) %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1 + %1 = load i32, ptr addrspace(1) %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1 + store i32 5, ptr addrspace(5) %arrayidx3, align 4 + %2 = load i32, ptr addrspace(5) %stack, align 4 + store i32 %2, ptr addrspace(1) %out, align 4 + %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1 + %3 = load i32, ptr addrspace(5) %arrayidx12 + %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1 + store i32 %3, ptr addrspace(1) %arrayidx13 ret void } @@ -52,69 +50,66 @@ entry: ; CI: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1024 x [5 x i32]] poison, align 4 ; GFX10PLUS: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1024 x [5 x i32]] poison, align 4 -define amdgpu_kernel void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 { +define amdgpu_kernel void @promote_alloca_size_1600(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #2 { entry: %stack = alloca [5 x i32], align 4, addrspace(5) - %0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0 - store i32 4, i32 addrspace(5)* %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 - %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1 - store i32 5, i32 addrspace(5)* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 - %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 - store i32 %2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 - %3 = load i32, i32 addrspace(5)* %arrayidx12 - %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 - store i32 %3, i32 addrspace(1)* %arrayidx13 + %0 = load i32, ptr addrspace(1) %in, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0 + store i32 4, ptr addrspace(5) %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1 + %1 = load i32, ptr addrspace(1) %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1 + store i32 5, ptr addrspace(5) %arrayidx3, align 4 + %2 = load i32, ptr addrspace(5) %stack, align 4 + store i32 %2, ptr addrspace(1) %out, align 4 + %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1 + %3 = load i32, ptr addrspace(5) %arrayidx12 + %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1 + store i32 %3, ptr addrspace(1) %arrayidx13 ret void } ; ALL-LABEL: @occupancy_0( ; CI-NOT: alloca [5 x i32] ; SI: alloca [5 x i32] -define amdgpu_kernel void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 { +define amdgpu_kernel void @occupancy_0(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #3 { entry: %stack = alloca [5 x i32], align 4, addrspace(5) - %0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0 - store i32 4, i32 addrspace(5)* %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 - %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1 - store i32 5, i32 addrspace(5)* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 - %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 - store i32 %2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 - %3 = load i32, i32 addrspace(5)* %arrayidx12 - %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 - store i32 %3, i32 addrspace(1)* %arrayidx13 + %0 = load i32, ptr addrspace(1) %in, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0 + store i32 4, ptr addrspace(5) %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1 + %1 = load i32, ptr addrspace(1) %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1 + store i32 5, ptr addrspace(5) %arrayidx3, align 4 + %2 = load i32, ptr addrspace(5) %stack, align 4 + store i32 %2, ptr addrspace(1) %out, align 4 + %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1 + %3 = load i32, ptr addrspace(5) %arrayidx12 + %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1 + store i32 %3, ptr addrspace(1) %arrayidx13 ret void } ; ALL-LABEL: @occupancy_max( ; CI-NOT: alloca [5 x i32] ; SI: alloca [5 x i32] -define amdgpu_kernel void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 { +define amdgpu_kernel void @occupancy_max(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #4 { entry: %stack = alloca [5 x i32], align 4, addrspace(5) - %0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0 - store i32 4, i32 addrspace(5)* %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 - %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1 - store i32 5, i32 addrspace(5)* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 - %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 - store i32 %2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 - %3 = load i32, i32 addrspace(5)* %arrayidx12 - %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 - store i32 %3, i32 addrspace(1)* %arrayidx13 + %0 = load i32, ptr addrspace(1) %in, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0 + store i32 4, ptr addrspace(5) %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1 + %1 = load i32, ptr addrspace(1) %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1 + store i32 5, ptr addrspace(5) %arrayidx3, align 4 + %2 = load i32, ptr addrspace(5) %stack, align 4 + store i32 %2, ptr addrspace(1) %out, align 4 + %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1 + %3 = load i32, ptr addrspace(5) %arrayidx12 + %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1 + store i32 %3, ptr addrspace(1) %arrayidx13 ret void } @@ -122,25 +117,24 @@ entry: ; CI-LABEL: @occupancy_6( ; SI: alloca ; CI-NOT: alloca -define amdgpu_kernel void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 { +define amdgpu_kernel void @occupancy_6(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #5 { entry: %stack = alloca [42 x i8], align 4, addrspace(5) - %tmp = load i8, i8 addrspace(1)* %in, align 1 + %tmp = load i8, ptr addrspace(1) %in, align 1 %tmp4 = sext i8 %tmp to i64 - %arrayidx1 = getelementptr inbounds [42 x i8], [42 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4 - store i8 4, i8 addrspace(5)* %arrayidx1, align 1 - %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1 - %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1 + %arrayidx1 = getelementptr inbounds [42 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4 + store i8 4, ptr addrspace(5) %arrayidx1, align 1 + %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1 + %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1 %tmp5 = sext i8 %tmp1 to i64 - %arrayidx3 = getelementptr inbounds [42 x i8], [42 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5 - store i8 5, i8 addrspace(5)* %arrayidx3, align 1 - %arrayidx10 = getelementptr inbounds [42 x i8], [42 x i8] addrspace(5)* %stack, i64 0, i64 0 - %tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1 - store i8 %tmp2, i8 addrspace(1)* %out, align 1 - %arrayidx12 = getelementptr inbounds [42 x i8], [42 x i8] addrspace(5)* %stack, i64 0, i64 1 - %tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1 - %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1 - store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1 + %arrayidx3 = getelementptr inbounds [42 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5 + store i8 5, ptr addrspace(5) %arrayidx3, align 1 + %tmp2 = load i8, ptr addrspace(5) %stack, align 1 + store i8 %tmp2, ptr addrspace(1) %out, align 1 + %arrayidx12 = getelementptr inbounds [42 x i8], ptr addrspace(5) %stack, i64 0, i64 1 + %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1 + %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1 + store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1 ret void } @@ -148,25 +142,24 @@ entry: ; SICI: alloca [43 x i8] ; GFX10PLUS-NOT: alloca -define amdgpu_kernel void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 { +define amdgpu_kernel void @occupancy_6_over(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #5 { entry: %stack = alloca [43 x i8], align 4, addrspace(5) - %tmp = load i8, i8 addrspace(1)* %in, align 1 + %tmp = load i8, ptr addrspace(1) %in, align 1 %tmp4 = sext i8 %tmp to i64 - %arrayidx1 = getelementptr inbounds [43 x i8], [43 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4 - store i8 4, i8 addrspace(5)* %arrayidx1, align 1 - %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1 - %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1 + %arrayidx1 = getelementptr inbounds [43 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4 + store i8 4, ptr addrspace(5) %arrayidx1, align 1 + %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1 + %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1 %tmp5 = sext i8 %tmp1 to i64 - %arrayidx3 = getelementptr inbounds [43 x i8], [43 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5 - store i8 5, i8 addrspace(5)* %arrayidx3, align 1 - %arrayidx10 = getelementptr inbounds [43 x i8], [43 x i8] addrspace(5)* %stack, i64 0, i64 0 - %tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1 - store i8 %tmp2, i8 addrspace(1)* %out, align 1 - %arrayidx12 = getelementptr inbounds [43 x i8], [43 x i8] addrspace(5)* %stack, i64 0, i64 1 - %tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1 - %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1 - store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1 + %arrayidx3 = getelementptr inbounds [43 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5 + store i8 5, ptr addrspace(5) %arrayidx3, align 1 + %tmp2 = load i8, ptr addrspace(5) %stack, align 1 + store i8 %tmp2, ptr addrspace(1) %out, align 1 + %arrayidx12 = getelementptr inbounds [43 x i8], ptr addrspace(5) %stack, i64 0, i64 1 + %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1 + %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1 + store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1 ret void } @@ -174,25 +167,24 @@ entry: ; CI-LABEL: @occupancy_8( ; SI: alloca ; CI-NOT: alloca -define amdgpu_kernel void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 { +define amdgpu_kernel void @occupancy_8(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #6 { entry: %stack = alloca [32 x i8], align 4, addrspace(5) - %tmp = load i8, i8 addrspace(1)* %in, align 1 + %tmp = load i8, ptr addrspace(1) %in, align 1 %tmp4 = sext i8 %tmp to i64 - %arrayidx1 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4 - store i8 4, i8 addrspace(5)* %arrayidx1, align 1 - %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1 - %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1 + %arrayidx1 = getelementptr inbounds [32 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4 + store i8 4, ptr addrspace(5) %arrayidx1, align 1 + %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1 + %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1 %tmp5 = sext i8 %tmp1 to i64 - %arrayidx3 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5 - store i8 5, i8 addrspace(5)* %arrayidx3, align 1 - %arrayidx10 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %stack, i64 0, i64 0 - %tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1 - store i8 %tmp2, i8 addrspace(1)* %out, align 1 - %arrayidx12 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %stack, i64 0, i64 1 - %tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1 - %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1 - store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1 + %arrayidx3 = getelementptr inbounds [32 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5 + store i8 5, ptr addrspace(5) %arrayidx3, align 1 + %tmp2 = load i8, ptr addrspace(5) %stack, align 1 + store i8 %tmp2, ptr addrspace(1) %out, align 1 + %arrayidx12 = getelementptr inbounds [32 x i8], ptr addrspace(5) %stack, i64 0, i64 1 + %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1 + %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1 + store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1 ret void } @@ -200,25 +192,24 @@ entry: ; SICI: alloca [33 x i8] ; GFX10PLUS-NOT: alloca -define amdgpu_kernel void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 { +define amdgpu_kernel void @occupancy_8_over(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #6 { entry: %stack = alloca [33 x i8], align 4, addrspace(5) - %tmp = load i8, i8 addrspace(1)* %in, align 1 + %tmp = load i8, ptr addrspace(1) %in, align 1 %tmp4 = sext i8 %tmp to i64 - %arrayidx1 = getelementptr inbounds [33 x i8], [33 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4 - store i8 4, i8 addrspace(5)* %arrayidx1, align 1 - %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1 - %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1 + %arrayidx1 = getelementptr inbounds [33 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4 + store i8 4, ptr addrspace(5) %arrayidx1, align 1 + %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1 + %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1 %tmp5 = sext i8 %tmp1 to i64 - %arrayidx3 = getelementptr inbounds [33 x i8], [33 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5 - store i8 5, i8 addrspace(5)* %arrayidx3, align 1 - %arrayidx10 = getelementptr inbounds [33 x i8], [33 x i8] addrspace(5)* %stack, i64 0, i64 0 - %tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1 - store i8 %tmp2, i8 addrspace(1)* %out, align 1 - %arrayidx12 = getelementptr inbounds [33 x i8], [33 x i8] addrspace(5)* %stack, i64 0, i64 1 - %tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1 - %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1 - store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1 + %arrayidx3 = getelementptr inbounds [33 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5 + store i8 5, ptr addrspace(5) %arrayidx3, align 1 + %tmp2 = load i8, ptr addrspace(5) %stack, align 1 + store i8 %tmp2, ptr addrspace(1) %out, align 1 + %arrayidx12 = getelementptr inbounds [33 x i8], ptr addrspace(5) %stack, i64 0, i64 1 + %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1 + %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1 + store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1 ret void } @@ -226,25 +217,24 @@ entry: ; CI-LABEL: @occupancy_9( ; SI: alloca ; CI-NOT: alloca -define amdgpu_kernel void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 { +define amdgpu_kernel void @occupancy_9(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #7 { entry: %stack = alloca [28 x i8], align 4, addrspace(5) - %tmp = load i8, i8 addrspace(1)* %in, align 1 + %tmp = load i8, ptr addrspace(1) %in, align 1 %tmp4 = sext i8 %tmp to i64 - %arrayidx1 = getelementptr inbounds [28 x i8], [28 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4 - store i8 4, i8 addrspace(5)* %arrayidx1, align 1 - %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1 - %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1 + %arrayidx1 = getelementptr inbounds [28 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4 + store i8 4, ptr addrspace(5) %arrayidx1, align 1 + %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1 + %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1 %tmp5 = sext i8 %tmp1 to i64 - %arrayidx3 = getelementptr inbounds [28 x i8], [28 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5 - store i8 5, i8 addrspace(5)* %arrayidx3, align 1 - %arrayidx10 = getelementptr inbounds [28 x i8], [28 x i8] addrspace(5)* %stack, i64 0, i64 0 - %tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1 - store i8 %tmp2, i8 addrspace(1)* %out, align 1 - %arrayidx12 = getelementptr inbounds [28 x i8], [28 x i8] addrspace(5)* %stack, i64 0, i64 1 - %tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1 - %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1 - store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1 + %arrayidx3 = getelementptr inbounds [28 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5 + store i8 5, ptr addrspace(5) %arrayidx3, align 1 + %tmp2 = load i8, ptr addrspace(5) %stack, align 1 + store i8 %tmp2, ptr addrspace(1) %out, align 1 + %arrayidx12 = getelementptr inbounds [28 x i8], ptr addrspace(5) %stack, i64 0, i64 1 + %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1 + %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1 + store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1 ret void } @@ -252,25 +242,24 @@ entry: ; SICI: alloca [29 x i8] ; GFX10PLUS-NOT: alloca -define amdgpu_kernel void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 { +define amdgpu_kernel void @occupancy_9_over(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #7 { entry: %stack = alloca [29 x i8], align 4, addrspace(5) - %tmp = load i8, i8 addrspace(1)* %in, align 1 + %tmp = load i8, ptr addrspace(1) %in, align 1 %tmp4 = sext i8 %tmp to i64 - %arrayidx1 = getelementptr inbounds [29 x i8], [29 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4 - store i8 4, i8 addrspace(5)* %arrayidx1, align 1 - %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1 - %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1 + %arrayidx1 = getelementptr inbounds [29 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4 + store i8 4, ptr addrspace(5) %arrayidx1, align 1 + %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1 + %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1 %tmp5 = sext i8 %tmp1 to i64 - %arrayidx3 = getelementptr inbounds [29 x i8], [29 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5 - store i8 5, i8 addrspace(5)* %arrayidx3, align 1 - %arrayidx10 = getelementptr inbounds [29 x i8], [29 x i8] addrspace(5)* %stack, i64 0, i64 0 - %tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1 - store i8 %tmp2, i8 addrspace(1)* %out, align 1 - %arrayidx12 = getelementptr inbounds [29 x i8], [29 x i8] addrspace(5)* %stack, i64 0, i64 1 - %tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1 - %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1 - store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1 + %arrayidx3 = getelementptr inbounds [29 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5 + store i8 5, ptr addrspace(5) %arrayidx3, align 1 + %tmp2 = load i8, ptr addrspace(5) %stack, align 1 + store i8 %tmp2, ptr addrspace(1) %out, align 1 + %arrayidx12 = getelementptr inbounds [29 x i8], ptr addrspace(5) %stack, i64 0, i64 1 + %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1 + %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1 + store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll index 5b1959801d87..40e838902f38 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll @@ -18,44 +18,40 @@ define amdgpu_vs void @promote_1d_aggr() #0 { ; CHECK-LABEL: @promote_1d_aggr( ; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) ; CHECK-NEXT: [[F1:%.*]] = alloca [1 x float], align 4, addrspace(5) -; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK:%.*]], [[BLOCK]] addrspace(1)* @block, i32 0, i32 1 -; CHECK-NEXT: [[FOO1:%.*]] = load i32, i32 addrspace(1)* [[FOO]], align 4 -; CHECK-NEXT: store i32 [[FOO1]], i32 addrspace(5)* [[I]], align 4 -; CHECK-NEXT: [[FOO2:%.*]] = getelementptr [[BLOCK]], [[BLOCK]] addrspace(1)* @block, i32 0, i32 0 -; CHECK-NEXT: [[FOO3:%.*]] = load [1 x float], [1 x float] addrspace(1)* [[FOO2]], align 4 -; CHECK-NEXT: store [1 x float] [[FOO3]], [1 x float] addrspace(5)* [[F1]], align 4 -; CHECK-NEXT: [[FOO4:%.*]] = load i32, i32 addrspace(5)* [[I]], align 4 -; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [1 x float], [1 x float] addrspace(5)* [[F1]], i32 0, i32 [[FOO4]] -; CHECK-NEXT: [[FOO6:%.*]] = load float, float addrspace(5)* [[FOO5]], align 4 +; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK:%.*]], ptr addrspace(1) @block, i32 0, i32 1 +; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4 +; CHECK-NEXT: store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4 +; CHECK-NEXT: [[FOO3:%.*]] = load [1 x float], ptr addrspace(1) @block, align 4 +; CHECK-NEXT: store [1 x float] [[FOO3]], ptr addrspace(5) [[F1]], align 4 +; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(5) [[I]], align 4 +; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]] +; CHECK-NEXT: [[FOO6:%.*]] = load float, ptr addrspace(5) [[FOO5]], align 4 ; CHECK-NEXT: [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5) -; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, <4 x float> addrspace(5)* [[FOO7]], align 16 +; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, ptr addrspace(5) [[FOO7]], align 16 ; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[FOO6]], i32 0 ; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[FOO6]], i32 1 ; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[FOO6]], i32 2 ; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[FOO6]], i32 3 -; CHECK-NEXT: [[FOO13:%.*]] = getelementptr [[GL_PERVERTEX:%.*]], [[GL_PERVERTEX]] addrspace(1)* @pv, i32 0, i32 0 -; CHECK-NEXT: store <4 x float> [[FOO12]], <4 x float> addrspace(1)* [[FOO13]], align 16 +; CHECK-NEXT: store <4 x float> [[FOO12]], ptr addrspace(1) @pv, align 16 ; CHECK-NEXT: ret void ; %i = alloca i32, addrspace(5) %f1 = alloca [1 x float], addrspace(5) - %foo = getelementptr %Block, %Block addrspace(1)* @block, i32 0, i32 1 - %foo1 = load i32, i32 addrspace(1)* %foo - store i32 %foo1, i32 addrspace(5)* %i - %foo2 = getelementptr %Block, %Block addrspace(1)* @block, i32 0, i32 0 - %foo3 = load [1 x float], [1 x float] addrspace(1)* %foo2 - store [1 x float] %foo3, [1 x float] addrspace(5)* %f1 - %foo4 = load i32, i32 addrspace(5)* %i - %foo5 = getelementptr [1 x float], [1 x float] addrspace(5)* %f1, i32 0, i32 %foo4 - %foo6 = load float, float addrspace(5)* %foo5 + %foo = getelementptr %Block, ptr addrspace(1) @block, i32 0, i32 1 + %foo1 = load i32, ptr addrspace(1) %foo + store i32 %foo1, ptr addrspace(5) %i + %foo3 = load [1 x float], ptr addrspace(1) @block + store [1 x float] %foo3, ptr addrspace(5) %f1 + %foo4 = load i32, ptr addrspace(5) %i + %foo5 = getelementptr [1 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4 + %foo6 = load float, ptr addrspace(5) %foo5 %foo7 = alloca <4 x float>, addrspace(5) - %foo8 = load <4 x float>, <4 x float> addrspace(5)* %foo7 + %foo8 = load <4 x float>, ptr addrspace(5) %foo7 %foo9 = insertelement <4 x float> %foo8, float %foo6, i32 0 %foo10 = insertelement <4 x float> %foo9, float %foo6, i32 1 %foo11 = insertelement <4 x float> %foo10, float %foo6, i32 2 %foo12 = insertelement <4 x float> %foo11, float %foo6, i32 3 - %foo13 = getelementptr %gl_PerVertex, %gl_PerVertex addrspace(1)* @pv, i32 0, i32 0 - store <4 x float> %foo12, <4 x float> addrspace(1)* %foo13 + store <4 x float> %foo12, ptr addrspace(1) @pv ret void } @@ -66,44 +62,36 @@ define amdgpu_vs void @promote_store_aggr() #0 { ; CHECK-LABEL: @promote_store_aggr( ; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) ; CHECK-NEXT: [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5) -; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK2:%.*]], [[BLOCK2]] addrspace(1)* @block2, i32 0, i32 0 -; CHECK-NEXT: [[FOO1:%.*]] = load i32, i32 addrspace(1)* [[FOO]], align 4 -; CHECK-NEXT: store i32 [[FOO1]], i32 addrspace(5)* [[I]], align 4 -; CHECK-NEXT: [[FOO2:%.*]] = load i32, i32 addrspace(5)* [[I]], align 4 +; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) @block2, align 4 +; CHECK-NEXT: store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4 +; CHECK-NEXT: [[FOO2:%.*]] = load i32, ptr addrspace(5) [[I]], align 4 ; CHECK-NEXT: [[FOO3:%.*]] = sitofp i32 [[FOO2]] to float -; CHECK-NEXT: [[FOO4:%.*]] = getelementptr [2 x float], [2 x float] addrspace(5)* [[F1]], i32 0, i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast [2 x float] addrspace(5)* [[F1]] to <2 x float> addrspace(5)* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float> addrspace(5)* [[TMP1]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[FOO3]], i32 0 -; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float> addrspace(5)* [[TMP1]], align 8 -; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], [2 x float] addrspace(5)* [[F1]], i32 0, i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast [2 x float] addrspace(5)* [[F1]] to <2 x float> addrspace(5)* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, <2 x float> addrspace(5)* [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float 2.000000e+00, i64 1 -; CHECK-NEXT: store <2 x float> [[TMP6]], <2 x float> addrspace(5)* [[TMP4]], align 8 -; CHECK-NEXT: [[FOO6:%.*]] = load [2 x float], [2 x float] addrspace(5)* [[F1]], align 4 -; CHECK-NEXT: [[FOO7:%.*]] = getelementptr [[BLOCK2]], [[BLOCK2]] addrspace(1)* @block2, i32 0, i32 1 -; CHECK-NEXT: store [2 x float] [[FOO6]], [2 x float] addrspace(1)* [[FOO7]], align 4 -; CHECK-NEXT: [[FOO8:%.*]] = getelementptr [[GL_PERVERTEX:%.*]], [[GL_PERVERTEX]] addrspace(1)* @pv, i32 0, i32 0 -; CHECK-NEXT: store <4 x float> , <4 x float> addrspace(1)* [[FOO8]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3]], i32 0 +; CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 8 +; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], ptr addrspace(5) [[F1]], i32 0, i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float 2.000000e+00, i64 1 +; CHECK-NEXT: store <2 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 8 +; CHECK-NEXT: [[FOO6:%.*]] = load [2 x float], ptr addrspace(5) [[F1]], align 4 +; CHECK-NEXT: [[FOO7:%.*]] = getelementptr [[BLOCK2:%.*]], ptr addrspace(1) @block2, i32 0, i32 1 +; CHECK-NEXT: store [2 x float] [[FOO6]], ptr addrspace(1) [[FOO7]], align 4 +; CHECK-NEXT: store <4 x float> , ptr addrspace(1) @pv, align 16 ; CHECK-NEXT: ret void ; %i = alloca i32, addrspace(5) %f1 = alloca [2 x float], addrspace(5) - %foo = getelementptr %Block2, %Block2 addrspace(1)* @block2, i32 0, i32 0 - %foo1 = load i32, i32 addrspace(1)* %foo - store i32 %foo1, i32 addrspace(5)* %i - %foo2 = load i32, i32 addrspace(5)* %i + %foo1 = load i32, ptr addrspace(1) @block2 + store i32 %foo1, ptr addrspace(5) %i + %foo2 = load i32, ptr addrspace(5) %i %foo3 = sitofp i32 %foo2 to float - %foo4 = getelementptr [2 x float], [2 x float] addrspace(5)* %f1, i32 0, i32 0 - store float %foo3, float addrspace(5)* %foo4 - %foo5 = getelementptr [2 x float], [2 x float] addrspace(5)* %f1, i32 0, i32 1 - store float 2.000000e+00, float addrspace(5)* %foo5 - %foo6 = load [2 x float], [2 x float] addrspace(5)* %f1 - %foo7 = getelementptr %Block2, %Block2 addrspace(1)* @block2, i32 0, i32 1 - store [2 x float] %foo6, [2 x float] addrspace(1)* %foo7 - %foo8 = getelementptr %gl_PerVertex, %gl_PerVertex addrspace(1)* @pv, i32 0, i32 0 - store <4 x float> , <4 x float> addrspace(1)* %foo8 + store float %foo3, ptr addrspace(5) %f1 + %foo5 = getelementptr [2 x float], ptr addrspace(5) %f1, i32 0, i32 1 + store float 2.000000e+00, ptr addrspace(5) %foo5 + %foo6 = load [2 x float], ptr addrspace(5) %f1 + %foo7 = getelementptr %Block2, ptr addrspace(1) @block2, i32 0, i32 1 + store [2 x float] %foo6, ptr addrspace(1) %foo7 + store <4 x float> , ptr addrspace(1) @pv ret void } @@ -114,46 +102,41 @@ define amdgpu_vs void @promote_load_from_store_aggr() #0 { ; CHECK-LABEL: @promote_load_from_store_aggr( ; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5) ; CHECK-NEXT: [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5) -; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK3:%.*]], [[BLOCK3]] addrspace(1)* @block3, i32 0, i32 1 -; CHECK-NEXT: [[FOO1:%.*]] = load i32, i32 addrspace(1)* [[FOO]], align 4 -; CHECK-NEXT: store i32 [[FOO1]], i32 addrspace(5)* [[I]], align 4 -; CHECK-NEXT: [[FOO2:%.*]] = getelementptr [[BLOCK3]], [[BLOCK3]] addrspace(1)* @block3, i32 0, i32 0 -; CHECK-NEXT: [[FOO3:%.*]] = load [2 x float], [2 x float] addrspace(1)* [[FOO2]], align 4 -; CHECK-NEXT: store [2 x float] [[FOO3]], [2 x float] addrspace(5)* [[F1]], align 4 -; CHECK-NEXT: [[FOO4:%.*]] = load i32, i32 addrspace(5)* [[I]], align 4 -; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], [2 x float] addrspace(5)* [[F1]], i32 0, i32 [[FOO4]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast [2 x float] addrspace(5)* [[F1]] to <2 x float> addrspace(5)* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float> addrspace(5)* [[TMP1]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 [[FOO4]] +; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 1 +; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4 +; CHECK-NEXT: store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4 +; CHECK-NEXT: [[FOO3:%.*]] = load [2 x float], ptr addrspace(1) @block3, align 4 +; CHECK-NEXT: store [2 x float] [[FOO3]], ptr addrspace(5) [[F1]], align 4 +; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(5) [[I]], align 4 +; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 [[FOO4]] ; CHECK-NEXT: [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5) -; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, <4 x float> addrspace(5)* [[FOO7]], align 16 -; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[TMP3]], i32 0 -; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP3]], i32 2 -; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP3]], i32 3 -; CHECK-NEXT: [[FOO13:%.*]] = getelementptr [[GL_PERVERTEX:%.*]], [[GL_PERVERTEX]] addrspace(1)* @pv, i32 0, i32 0 -; CHECK-NEXT: store <4 x float> [[FOO12]], <4 x float> addrspace(1)* [[FOO13]], align 16 +; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, ptr addrspace(5) [[FOO7]], align 16 +; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[TMP2]], i32 0 +; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP2]], i32 1 +; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP2]], i32 2 +; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP2]], i32 3 +; CHECK-NEXT: store <4 x float> [[FOO12]], ptr addrspace(1) @pv, align 16 ; CHECK-NEXT: ret void ; %i = alloca i32, addrspace(5) %f1 = alloca [2 x float], addrspace(5) - %foo = getelementptr %Block3, %Block3 addrspace(1)* @block3, i32 0, i32 1 - %foo1 = load i32, i32 addrspace(1)* %foo - store i32 %foo1, i32 addrspace(5)* %i - %foo2 = getelementptr %Block3, %Block3 addrspace(1)* @block3, i32 0, i32 0 - %foo3 = load [2 x float], [2 x float] addrspace(1)* %foo2 - store [2 x float] %foo3, [2 x float] addrspace(5)* %f1 - %foo4 = load i32, i32 addrspace(5)* %i - %foo5 = getelementptr [2 x float], [2 x float] addrspace(5)* %f1, i32 0, i32 %foo4 - %foo6 = load float, float addrspace(5)* %foo5 + %foo = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 1 + %foo1 = load i32, ptr addrspace(1) %foo + store i32 %foo1, ptr addrspace(5) %i + %foo3 = load [2 x float], ptr addrspace(1) @block3 + store [2 x float] %foo3, ptr addrspace(5) %f1 + %foo4 = load i32, ptr addrspace(5) %i + %foo5 = getelementptr [2 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4 + %foo6 = load float, ptr addrspace(5) %foo5 %foo7 = alloca <4 x float>, addrspace(5) - %foo8 = load <4 x float>, <4 x float> addrspace(5)* %foo7 + %foo8 = load <4 x float>, ptr addrspace(5) %foo7 %foo9 = insertelement <4 x float> %foo8, float %foo6, i32 0 %foo10 = insertelement <4 x float> %foo9, float %foo6, i32 1 %foo11 = insertelement <4 x float> %foo10, float %foo6, i32 2 %foo12 = insertelement <4 x float> %foo11, float %foo6, i32 3 - %foo13 = getelementptr %gl_PerVertex, %gl_PerVertex addrspace(1)* @pv, i32 0, i32 0 - store <4 x float> %foo12, <4 x float> addrspace(1)* %foo13 + store <4 x float> %foo12, ptr addrspace(1) @pv ret void } @@ -163,70 +146,61 @@ define amdgpu_vs void @promote_load_from_store_aggr() #0 { define amdgpu_ps void @promote_double_aggr() #0 { ; CHECK-LABEL: @promote_double_aggr( ; CHECK-NEXT: [[S:%.*]] = alloca [2 x double], align 8, addrspace(5) -; CHECK-NEXT: [[FOO:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(1)* @tmp_g, i32 0, i32 0, i32 0 -; CHECK-NEXT: [[FOO1:%.*]] = load double, double addrspace(1)* [[FOO]], align 8 -; CHECK-NEXT: [[FOO2:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(1)* @tmp_g, i32 0, i32 0, i32 1 -; CHECK-NEXT: [[FOO3:%.*]] = load double, double addrspace(1)* [[FOO2]], align 8 +; CHECK-NEXT: [[FOO:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 0 +; CHECK-NEXT: [[FOO1:%.*]] = load double, ptr addrspace(1) [[FOO]], align 8 +; CHECK-NEXT: [[FOO2:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 1 +; CHECK-NEXT: [[FOO3:%.*]] = load double, ptr addrspace(1) [[FOO2]], align 8 ; CHECK-NEXT: [[FOO4:%.*]] = insertvalue [2 x double] undef, double [[FOO1]], 0 ; CHECK-NEXT: [[FOO5:%.*]] = insertvalue [2 x double] [[FOO4]], double [[FOO3]], 1 -; CHECK-NEXT: store [2 x double] [[FOO5]], [2 x double] addrspace(5)* [[S]], align 8 -; CHECK-NEXT: [[FOO6:%.*]] = getelementptr [2 x double], [2 x double] addrspace(5)* [[S]], i32 0, i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast [2 x double] addrspace(5)* [[S]] to <2 x double> addrspace(5)* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double> addrspace(5)* [[TMP1]], align 16 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i64 1 -; CHECK-NEXT: [[FOO8:%.*]] = getelementptr [2 x double], [2 x double] addrspace(5)* [[S]], i32 0, i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast [2 x double] addrspace(5)* [[S]] to <2 x double> addrspace(5)* -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double> addrspace(5)* [[TMP4]], align 16 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 1 -; CHECK-NEXT: [[FOO10:%.*]] = fadd double [[TMP3]], [[TMP6]] -; CHECK-NEXT: [[FOO11:%.*]] = getelementptr [2 x double], [2 x double] addrspace(5)* [[S]], i32 0, i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast [2 x double] addrspace(5)* [[S]] to <2 x double> addrspace(5)* -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double> addrspace(5)* [[TMP7]], align 16 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[FOO10]], i32 0 -; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double> addrspace(5)* [[TMP7]], align 16 -; CHECK-NEXT: [[FOO12:%.*]] = getelementptr [2 x double], [2 x double] addrspace(5)* [[S]], i32 0, i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast [2 x double] addrspace(5)* [[S]] to <2 x double> addrspace(5)* -; CHECK-NEXT: [[TMP11:%.*]] = load <2 x double>, <2 x double> addrspace(5)* [[TMP10]], align 16 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP11]], i32 0 -; CHECK-NEXT: [[FOO14:%.*]] = getelementptr [2 x double], [2 x double] addrspace(5)* [[S]], i32 0, i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast [2 x double] addrspace(5)* [[S]] to <2 x double> addrspace(5)* -; CHECK-NEXT: [[TMP14:%.*]] = load <2 x double>, <2 x double> addrspace(5)* [[TMP13]], align 16 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[TMP14]], i64 1 -; CHECK-NEXT: [[FOO16:%.*]] = fadd double [[TMP12]], [[TMP15]] +; CHECK-NEXT: store [2 x double] [[FOO5]], ptr addrspace(5) [[S]], align 8 +; CHECK-NEXT: [[FOO6:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i64 1 +; CHECK-NEXT: [[FOO8:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 16 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i64 1 +; CHECK-NEXT: [[FOO10:%.*]] = fadd double [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 16 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[FOO10]], i32 0 +; CHECK-NEXT: store <2 x double> [[TMP6]], ptr addrspace(5) [[S]], align 16 +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 16 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 +; CHECK-NEXT: [[FOO14:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 16 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i64 1 +; CHECK-NEXT: [[FOO16:%.*]] = fadd double [[TMP8]], [[TMP10]] ; CHECK-NEXT: [[FOO17:%.*]] = fptrunc double [[FOO16]] to float ; CHECK-NEXT: [[FOO18:%.*]] = insertelement <4 x float> undef, float [[FOO17]], i32 0 ; CHECK-NEXT: [[FOO19:%.*]] = insertelement <4 x float> [[FOO18]], float [[FOO17]], i32 1 ; CHECK-NEXT: [[FOO20:%.*]] = insertelement <4 x float> [[FOO19]], float [[FOO17]], i32 2 ; CHECK-NEXT: [[FOO21:%.*]] = insertelement <4 x float> [[FOO20]], float [[FOO17]], i32 3 -; CHECK-NEXT: store <4 x float> [[FOO21]], <4 x float> addrspace(1)* @frag_color, align 16 +; CHECK-NEXT: store <4 x float> [[FOO21]], ptr addrspace(1) @frag_color, align 16 ; CHECK-NEXT: ret void ; %s = alloca [2 x double], addrspace(5) - %foo = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(1)* @tmp_g, i32 0, i32 0, i32 0 - %foo1 = load double, double addrspace(1)* %foo - %foo2 = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(1)* @tmp_g, i32 0, i32 0, i32 1 - %foo3 = load double, double addrspace(1)* %foo2 + %foo = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 0 + %foo1 = load double, ptr addrspace(1) %foo + %foo2 = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 1 + %foo3 = load double, ptr addrspace(1) %foo2 %foo4 = insertvalue [2 x double] undef, double %foo1, 0 %foo5 = insertvalue [2 x double] %foo4, double %foo3, 1 - store [2 x double] %foo5, [2 x double] addrspace(5)* %s - %foo6 = getelementptr [2 x double], [2 x double] addrspace(5)* %s, i32 0, i32 1 - %foo7 = load double, double addrspace(5)* %foo6 - %foo8 = getelementptr [2 x double], [2 x double] addrspace(5)* %s, i32 0, i32 1 - %foo9 = load double, double addrspace(5)* %foo8 + store [2 x double] %foo5, ptr addrspace(5) %s + %foo6 = getelementptr [2 x double], ptr addrspace(5) %s, i32 0, i32 1 + %foo7 = load double, ptr addrspace(5) %foo6 + %foo8 = getelementptr [2 x double], ptr addrspace(5) %s, i32 0, i32 1 + %foo9 = load double, ptr addrspace(5) %foo8 %foo10 = fadd double %foo7, %foo9 - %foo11 = getelementptr [2 x double], [2 x double] addrspace(5)* %s, i32 0, i32 0 - store double %foo10, double addrspace(5)* %foo11 - %foo12 = getelementptr [2 x double], [2 x double] addrspace(5)* %s, i32 0, i32 0 - %foo13 = load double, double addrspace(5)* %foo12 - %foo14 = getelementptr [2 x double], [2 x double] addrspace(5)* %s, i32 0, i32 1 - %foo15 = load double, double addrspace(5)* %foo14 + store double %foo10, ptr addrspace(5) %s + %foo13 = load double, ptr addrspace(5) %s + %foo14 = getelementptr [2 x double], ptr addrspace(5) %s, i32 0, i32 1 + %foo15 = load double, ptr addrspace(5) %foo14 %foo16 = fadd double %foo13, %foo15 %foo17 = fptrunc double %foo16 to float %foo18 = insertelement <4 x float> undef, float %foo17, i32 0 %foo19 = insertelement <4 x float> %foo18, float %foo17, i32 1 %foo20 = insertelement <4 x float> %foo19, float %foo17, i32 2 %foo21 = insertelement <4 x float> %foo20, float %foo17, i32 3 - store <4 x float> %foo21, <4 x float> addrspace(1)* @frag_color + store <4 x float> %foo21, ptr addrspace(1) @frag_color ret void } @@ -234,22 +208,21 @@ define amdgpu_ps void @promote_double_aggr() #0 { define amdgpu_kernel void @alloca_struct() #0 { ; CHECK-LABEL: @alloca_struct( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[TMP0]] to i32 addrspace(4)* -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(4)* [[TMP1]], i64 1 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[TMP2]], align 4, !invariant.load !0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(4)* [[TMP1]], i64 2 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32 addrspace(4)* [[TMP4]], align 4, !range [[RNG1:![0-9]+]], !invariant.load !0 -; CHECK-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP3]], 16 -; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG2:![0-9]+]] -; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.workitem.id.y(), !range [[RNG2]] -; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.workitem.id.z(), !range [[RNG2]] -; CHECK-NEXT: [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x [2 x %struct]], [1024 x [2 x %struct]] addrspace(3)* @alloca_struct.alloca, i32 0, i32 [[TMP14]] +; CHECK-NEXT: [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load !0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1:![0-9]+]], !invariant.load !0 +; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP2]], 16 +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG2:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.workitem.id.y(), !range [[RNG2]] +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.workitem.id.z(), !range [[RNG2]] +; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x [2 x %struct]], ptr addrspace(3) @alloca_struct.alloca, i32 0, i32 [[TMP13]] ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll index 9d6f10bda03f..a590bbc8023e 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll @@ -5,45 +5,43 @@ ; CHECK-LABEL: @array_alloca( ; CHECK: %stack = alloca i32, i32 5, align 4, addrspace(5) -define amdgpu_kernel void @array_alloca(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +define amdgpu_kernel void @array_alloca(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 { entry: %stack = alloca i32, i32 5, align 4, addrspace(5) - %ld0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 %ld0 - store i32 4, i32 addrspace(5)* %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 - %ld1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 %ld1 - store i32 5, i32 addrspace(5)* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 0 - %ld2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 - store i32 %ld2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 1 - %ld3 = load i32, i32 addrspace(5)* %arrayidx12 - %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 - store i32 %ld3, i32 addrspace(1)* %arrayidx13 + %ld0 = load i32, ptr addrspace(1) %in, align 4 + %arrayidx1 = getelementptr inbounds i32, ptr addrspace(5) %stack, i32 %ld0 + store i32 4, ptr addrspace(5) %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1 + %ld1 = load i32, ptr addrspace(1) %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(5) %stack, i32 %ld1 + store i32 5, ptr addrspace(5) %arrayidx3, align 4 + %ld2 = load i32, ptr addrspace(5) %stack, align 4 + store i32 %ld2, ptr addrspace(1) %out, align 4 + %arrayidx12 = getelementptr inbounds i32, ptr addrspace(5) %stack, i32 1 + %ld3 = load i32, ptr addrspace(5) %arrayidx12 + %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1 + store i32 %ld3, ptr addrspace(1) %arrayidx13 ret void } ; CHECK-LABEL: @array_alloca_dynamic( ; CHECK: %stack = alloca i32, i32 %size, align 4, addrspace(5) -define amdgpu_kernel void @array_alloca_dynamic(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %size) #0 { +define amdgpu_kernel void @array_alloca_dynamic(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in, i32 %size) #0 { entry: %stack = alloca i32, i32 %size, align 4, addrspace(5) - %ld0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 %ld0 - store i32 4, i32 addrspace(5)* %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 - %ld1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 %ld1 - store i32 5, i32 addrspace(5)* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 0 - %ld2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 - store i32 %ld2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 1 - %ld3 = load i32, i32 addrspace(5)* %arrayidx12 - %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 - store i32 %ld3, i32 addrspace(1)* %arrayidx13 + %ld0 = load i32, ptr addrspace(1) %in, align 4 + %arrayidx1 = getelementptr inbounds i32, ptr addrspace(5) %stack, i32 %ld0 + store i32 4, ptr addrspace(5) %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1 + %ld1 = load i32, ptr addrspace(1) %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds i32, ptr addrspace(5) %stack, i32 %ld1 + store i32 5, ptr addrspace(5) %arrayidx3, align 4 + %ld2 = load i32, ptr addrspace(5) %stack, align 4 + store i32 %ld2, ptr addrspace(1) %out, align 4 + %arrayidx12 = getelementptr inbounds i32, ptr addrspace(5) %stack, i32 1 + %ld3 = load i32, ptr addrspace(5) %arrayidx12 + %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1 + store i32 %ld3, ptr addrspace(1) %arrayidx13 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll index c18b80fd221f..ec83d7f313d6 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-calling-conv.ll @@ -1,28 +1,27 @@ ; RUN: opt -data-layout=A5 -S -mtriple=amdgcn-unknown-unknown -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=IR %s ; RUN: llc -march=amdgcn -mcpu=fiji -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=ASM %s -; IR-LABEL: define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 { +; IR-LABEL: define amdgpu_vs void @promote_alloca_shaders(ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %in) #0 { ; IR: alloca [5 x i32] ; ASM-LABEL: {{^}}promote_alloca_shaders: ; ASM: ; ScratchSize: 24 -define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 { +define amdgpu_vs void @promote_alloca_shaders(ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %in) #0 { entry: %stack = alloca [5 x i32], align 4, addrspace(5) - %tmp0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp0 - store i32 4, i32 addrspace(5)* %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 - %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp1 - store i32 5, i32 addrspace(5)* %arrayidx3, align 4 - %arrayidx4 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 - %tmp2 = load i32, i32 addrspace(5)* %arrayidx4, align 4 - store i32 %tmp2, i32 addrspace(1)* %out, align 4 - %arrayidx5 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 - %tmp3 = load i32, i32 addrspace(5)* %arrayidx5 - %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 - store i32 %tmp3, i32 addrspace(1)* %arrayidx6 + %tmp0 = load i32, ptr addrspace(1) %in, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp0 + store i32 4, ptr addrspace(5) %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1 + %tmp1 = load i32, ptr addrspace(1) %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp1 + store i32 5, ptr addrspace(5) %arrayidx3, align 4 + %tmp2 = load i32, ptr addrspace(5) %stack, align 4 + store i32 %tmp2, ptr addrspace(1) %out, align 4 + %arrayidx5 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1 + %tmp3 = load i32, ptr addrspace(5) %arrayidx5 + %arrayidx6 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1 + store i32 %tmp3, ptr addrspace(1) %arrayidx6 ret void } @@ -33,18 +32,17 @@ entry: ; ASM-LABEL: {{^}}promote_to_vector_call_c: ; ASM-NOT: LDSByteSize ; ASM: ; ScratchSize: 12 -define void @promote_to_vector_call_c(i32 addrspace(1)* %out, i32 %in) #0 { +define void @promote_to_vector_call_c(ptr addrspace(1) %out, i32 %in) #0 { entry: %tmp = alloca [2 x i32], addrspace(5) - %tmp1 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 0 - %tmp2 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 1 - store i32 0, i32 addrspace(5)* %tmp1 - store i32 1, i32 addrspace(5)* %tmp2 - %tmp3 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 %in - %tmp4 = load i32, i32 addrspace(5)* %tmp3 - %tmp5 = load volatile i32, i32 addrspace(1)* undef + %tmp2 = getelementptr [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 1 + store i32 0, ptr addrspace(5) %tmp + store i32 1, ptr addrspace(5) %tmp2 + %tmp3 = getelementptr [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 %in + %tmp4 = load i32, ptr addrspace(5) %tmp3 + %tmp5 = load volatile i32, ptr addrspace(1) undef %tmp6 = add i32 %tmp4, %tmp5 - store i32 %tmp6, i32 addrspace(1)* %out + store i32 %tmp6, ptr addrspace(1) %out ret void } @@ -54,43 +52,41 @@ entry: ; ASM-LABEL: {{^}}no_promote_to_lds_c: ; ASM-NOT: LDSByteSize ; ASM: ; ScratchSize: 24 -define void @no_promote_to_lds_c(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +define void @no_promote_to_lds_c(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 { entry: %stack = alloca [5 x i32], align 4, addrspace(5) - %0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0 - store i32 4, i32 addrspace(5)* %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 - %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1 - store i32 5, i32 addrspace(5)* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 - %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 - store i32 %2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 - %3 = load i32, i32 addrspace(5)* %arrayidx12 - %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 - store i32 %3, i32 addrspace(1)* %arrayidx13 + %0 = load i32, ptr addrspace(1) %in, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0 + store i32 4, ptr addrspace(5) %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1 + %1 = load i32, ptr addrspace(1) %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1 + store i32 5, ptr addrspace(5) %arrayidx3, align 4 + %2 = load i32, ptr addrspace(5) %stack, align 4 + store i32 %2, ptr addrspace(1) %out, align 4 + %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1 + %3 = load i32, ptr addrspace(5) %arrayidx12 + %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1 + store i32 %3, ptr addrspace(1) %arrayidx13 ret void } -declare i32 @foo(i32 addrspace(5)*) #0 +declare i32 @foo(ptr addrspace(5)) #0 ; ASM-LABEL: {{^}}call_private: ; ASM: buffer_store_dword ; ASM: buffer_store_dword ; ASM: s_swappc_b64 ; ASM: ScratchSize: 16400 -define amdgpu_kernel void @call_private(i32 addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @call_private(ptr addrspace(1) %out, i32 %in) #0 { entry: %tmp = alloca [2 x i32], addrspace(5) - %tmp1 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 0 - %tmp2 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 1 - store i32 0, i32 addrspace(5)* %tmp1 - store i32 1, i32 addrspace(5)* %tmp2 - %tmp3 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 %in - %val = call i32 @foo(i32 addrspace(5)* %tmp3) - store i32 %val, i32 addrspace(1)* %out + %tmp2 = getelementptr [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 1 + store i32 0, ptr addrspace(5) %tmp + store i32 1, ptr addrspace(5) %tmp2 + %tmp3 = getelementptr [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 %in + %val = call i32 @foo(ptr addrspace(5) %tmp3) + store i32 %val, ptr addrspace(1) %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll index a3f988c5acb9..3071562c85ca 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll @@ -1,23 +1,22 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mtriple=amdgcn-- -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s target datalayout = "A5" -declare {}* @llvm.invariant.start.p5i8(i64, i8 addrspace(5)* nocapture) #0 -declare void @llvm.invariant.end.p5i8({}*, i64, i8 addrspace(5)* nocapture) #0 -declare i8 addrspace(5)* @llvm.launder.invariant.group.p5i8(i8 addrspace(5)*) #1 +declare ptr @llvm.invariant.start.p5(i64, ptr addrspace(5) nocapture) #0 +declare void @llvm.invariant.end.p5(ptr, i64, ptr addrspace(5) nocapture) #0 +declare ptr addrspace(5) @llvm.launder.invariant.group.p5(ptr addrspace(5)) #1 ; GCN-LABEL: {{^}}use_invariant_promotable_lds: ; GCN: buffer_load_dword ; GCN: ds_write_b32 -define amdgpu_kernel void @use_invariant_promotable_lds(i32 addrspace(1)* %arg) #2 { +define amdgpu_kernel void @use_invariant_promotable_lds(ptr addrspace(1) %arg) #2 { bb: %tmp = alloca i32, align 4, addrspace(5) - %tmp1 = bitcast i32 addrspace(5)* %tmp to i8 addrspace(5)* - %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1 - %tmp3 = load i32, i32 addrspace(1)* %tmp2 - store i32 %tmp3, i32 addrspace(5)* %tmp - %tmp4 = call {}* @llvm.invariant.start.p5i8(i64 4, i8 addrspace(5)* %tmp1) #0 - call void @llvm.invariant.end.p5i8({}* %tmp4, i64 4, i8 addrspace(5)* %tmp1) #0 - %tmp5 = call i8 addrspace(5)* @llvm.launder.invariant.group.p5i8(i8 addrspace(5)* %tmp1) #1 + %tmp2 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1 + %tmp3 = load i32, ptr addrspace(1) %tmp2 + store i32 %tmp3, ptr addrspace(5) %tmp + %tmp4 = call ptr @llvm.invariant.start.p5(i64 4, ptr addrspace(5) %tmp) #0 + call void @llvm.invariant.end.p5(ptr %tmp4, i64 4, ptr addrspace(5) %tmp) #0 + %tmp5 = call ptr addrspace(5) @llvm.launder.invariant.group.p5(ptr addrspace(5) %tmp) #1 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll index 51dc94f19a81..359d45620372 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll @@ -2,22 +2,21 @@ target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" -declare void @llvm.lifetime.start.p5i8(i64, i8 addrspace(5)* nocapture) #0 -declare void @llvm.lifetime.end.p5i8(i64, i8 addrspace(5)* nocapture) #0 +declare void @llvm.lifetime.start.p5(i64, ptr addrspace(5) nocapture) #0 +declare void @llvm.lifetime.end.p5(i64, ptr addrspace(5) nocapture) #0 ; OPT-LABEL: @use_lifetime_promotable_lds( ; OPT-NOT: alloca i32 ; OPT-NOT: llvm.lifetime -; OPT: store i32 %tmp3, i32 addrspace(3)* -define amdgpu_kernel void @use_lifetime_promotable_lds(i32 addrspace(1)* %arg) #2 { +; OPT: store i32 %tmp3, ptr addrspace(3) +define amdgpu_kernel void @use_lifetime_promotable_lds(ptr addrspace(1) %arg) #2 { bb: %tmp = alloca i32, align 4, addrspace(5) - %tmp1 = bitcast i32 addrspace(5)* %tmp to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 4, i8 addrspace(5)* %tmp1) - %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1 - %tmp3 = load i32, i32 addrspace(1)* %tmp2 - store i32 %tmp3, i32 addrspace(5)* %tmp - call void @llvm.lifetime.end.p5i8(i64 4, i8 addrspace(5)* %tmp1) + call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %tmp) + %tmp2 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1 + %tmp3 = load i32, ptr addrspace(1) %tmp2 + store i32 %tmp3, ptr addrspace(5) %tmp + call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %tmp) ret void } @@ -29,7 +28,7 @@ bb: define amdgpu_kernel void @iterator_erased_lifetime() { entry: %alloca = alloca i8, align 1, addrspace(5) - call void @llvm.lifetime.start.p5i8(i64 1, i8 addrspace(5)* %alloca) + call void @llvm.lifetime.start.p5(i64 1, ptr addrspace(5) %alloca) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll index 548ec44daf8f..0bba1bdce956 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll @@ -1,95 +1,77 @@ -; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -passes=amdgpu-promote-alloca < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -passes=amdgpu-promote-alloca < %s | FileCheck --enable-var-scope %s -declare void @llvm.memcpy.p5i8.p1i8.i32(i8 addrspace(5)* nocapture, i8 addrspace(1)* nocapture, i32, i1) #0 -declare void @llvm.memcpy.p1i8.p5i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(5)* nocapture, i32, i1) #0 -declare void @llvm.memcpy.p5i8.p5i8.i64(i8 addrspace(5)* nocapture, i8 addrspace(5)* nocapture, i64, i1) #0 +declare void @llvm.memcpy.p5.p1.i32(ptr addrspace(5) nocapture, ptr addrspace(1) nocapture, i32, i1) #0 +declare void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) nocapture, ptr addrspace(5) nocapture, i32, i1) #0 +declare void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) nocapture, ptr addrspace(5) nocapture, i64, i1) #0 -declare void @llvm.memmove.p5i8.p1i8.i32(i8 addrspace(5)* nocapture, i8 addrspace(1)* nocapture, i32, i1) #0 -declare void @llvm.memmove.p1i8.p5i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(5)* nocapture, i32, i1) #0 -declare void @llvm.memmove.p5i8.p5i8.i64(i8 addrspace(5)* nocapture, i8 addrspace(5)* nocapture, i64, i1) #0 +declare void @llvm.memmove.p5.p1.i32(ptr addrspace(5) nocapture, ptr addrspace(1) nocapture, i32, i1) #0 +declare void @llvm.memmove.p1.p5.i32(ptr addrspace(1) nocapture, ptr addrspace(5) nocapture, i32, i1) #0 +declare void @llvm.memmove.p5.p5.i64(ptr addrspace(5) nocapture, ptr addrspace(5) nocapture, i64, i1) #0 -declare void @llvm.memset.p5i8.i32(i8 addrspace(5)* nocapture, i8, i32, i1) #0 +declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture, i8, i32, i1) #0 -declare i32 @llvm.objectsize.i32.p5i8(i8 addrspace(5)*, i1, i1, i1) #1 +declare i32 @llvm.objectsize.i32.p5(ptr addrspace(5), i1, i1, i1) #1 ; CHECK-LABEL: @promote_with_memcpy( -; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memcpy.alloca, i32 0, i32 %{{[0-9]+}} -; CHECK: call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %alloca.bc, i8 addrspace(1)* align 4 %in.bc, i32 68, i1 false) -; CHECK: call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out.bc, i8 addrspace(3)* align 4 %alloca.bc, i32 68, i1 false) -define amdgpu_kernel void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; CHECK: [[GEP:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], ptr addrspace(3) @promote_with_memcpy.alloca, i32 0, i32 %{{[0-9]+}} +; CHECK: call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 4 [[GEP]], ptr addrspace(1) align 4 %in, i32 68, i1 false) +; CHECK: call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 %out, ptr addrspace(3) align 4 [[GEP]], i32 68, i1 false) +define amdgpu_kernel void @promote_with_memcpy(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %alloca = alloca [17 x i32], align 4, addrspace(5) - %alloca.bc = bitcast [17 x i32] addrspace(5)* %alloca to i8 addrspace(5)* - %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)* - %out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)* - call void @llvm.memcpy.p5i8.p1i8.i32(i8 addrspace(5)* align 4 %alloca.bc, i8 addrspace(1)* align 4 %in.bc, i32 68, i1 false) - call void @llvm.memcpy.p1i8.p5i8.i32(i8 addrspace(1)* align 4 %out.bc, i8 addrspace(5)* align 4 %alloca.bc, i32 68, i1 false) + call void @llvm.memcpy.p5.p1.i32(ptr addrspace(5) align 4 %alloca, ptr addrspace(1) align 4 %in, i32 68, i1 false) + call void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) align 4 %out, ptr addrspace(5) align 4 %alloca, i32 68, i1 false) ret void } ; CHECK-LABEL: @promote_with_memmove( -; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memmove.alloca, i32 0, i32 %{{[0-9]+}} -; CHECK: call void @llvm.memmove.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %alloca.bc, i8 addrspace(1)* align 4 %in.bc, i32 68, i1 false) -; CHECK: call void @llvm.memmove.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out.bc, i8 addrspace(3)* align 4 %alloca.bc, i32 68, i1 false) -define amdgpu_kernel void @promote_with_memmove(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; CHECK: [[GEP:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], ptr addrspace(3) @promote_with_memmove.alloca, i32 0, i32 %{{[0-9]+}} +; CHECK: call void @llvm.memmove.p3.p1.i32(ptr addrspace(3) align 4 [[GEP]], ptr addrspace(1) align 4 %in, i32 68, i1 false) +; CHECK: call void @llvm.memmove.p1.p3.i32(ptr addrspace(1) align 4 %out, ptr addrspace(3) align 4 [[GEP]], i32 68, i1 false) +define amdgpu_kernel void @promote_with_memmove(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %alloca = alloca [17 x i32], align 4, addrspace(5) - %alloca.bc = bitcast [17 x i32] addrspace(5)* %alloca to i8 addrspace(5)* - %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)* - %out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)* - call void @llvm.memmove.p5i8.p1i8.i32(i8 addrspace(5)* align 4 %alloca.bc, i8 addrspace(1)* align 4 %in.bc, i32 68, i1 false) - call void @llvm.memmove.p1i8.p5i8.i32(i8 addrspace(1)* align 4 %out.bc, i8 addrspace(5)* align 4 %alloca.bc, i32 68, i1 false) + call void @llvm.memmove.p5.p1.i32(ptr addrspace(5) align 4 %alloca, ptr addrspace(1) align 4 %in, i32 68, i1 false) + call void @llvm.memmove.p1.p5.i32(ptr addrspace(1) align 4 %out, ptr addrspace(5) align 4 %alloca, i32 68, i1 false) ret void } ; CHECK-LABEL: @promote_with_memset( -; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memset.alloca, i32 0, i32 %{{[0-9]+}} -; CHECK: call void @llvm.memset.p3i8.i32(i8 addrspace(3)* align 4 %alloca.bc, i8 7, i32 68, i1 false) -define amdgpu_kernel void @promote_with_memset(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +; CHECK: [[GEP:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], ptr addrspace(3) @promote_with_memset.alloca, i32 0, i32 %{{[0-9]+}} +; CHECK: call void @llvm.memset.p3.i32(ptr addrspace(3) align 4 [[GEP]], i8 7, i32 68, i1 false) +define amdgpu_kernel void @promote_with_memset(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { %alloca = alloca [17 x i32], align 4, addrspace(5) - %alloca.bc = bitcast [17 x i32] addrspace(5)* %alloca to i8 addrspace(5)* - %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)* - %out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)* - call void @llvm.memset.p5i8.i32(i8 addrspace(5)* align 4 %alloca.bc, i8 7, i32 68, i1 false) + call void @llvm.memset.p5.i32(ptr addrspace(5) align 4 %alloca, i8 7, i32 68, i1 false) ret void } ; CHECK-LABEL: @promote_with_objectsize( -; CHECK: [[PTR:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_objectsize.alloca, i32 0, i32 %{{[0-9]+}} -; CHECK: call i32 @llvm.objectsize.i32.p3i8(i8 addrspace(3)* %alloca.bc, i1 false, i1 false, i1 false) -define amdgpu_kernel void @promote_with_objectsize(i32 addrspace(1)* %out) #0 { +; CHECK: [[PTR:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], ptr addrspace(3) @promote_with_objectsize.alloca, i32 0, i32 %{{[0-9]+}} +; CHECK: call i32 @llvm.objectsize.i32.p3(ptr addrspace(3) [[PTR]], i1 false, i1 false, i1 false) +define amdgpu_kernel void @promote_with_objectsize(ptr addrspace(1) %out) #0 { %alloca = alloca [17 x i32], align 4, addrspace(5) - %alloca.bc = bitcast [17 x i32] addrspace(5)* %alloca to i8 addrspace(5)* - %size = call i32 @llvm.objectsize.i32.p5i8(i8 addrspace(5)* %alloca.bc, i1 false, i1 false, i1 false) - store i32 %size, i32 addrspace(1)* %out + %size = call i32 @llvm.objectsize.i32.p5(ptr addrspace(5) %alloca, i1 false, i1 false, i1 false) + store i32 %size, ptr addrspace(1) %out ret void } ; CHECK-LABEL: @promote_alloca_used_twice_in_memcpy( -; CHECK: %i = bitcast double addrspace(3)* %arrayidx1 to i8 addrspace(3)* -; CHECK: %i1 = bitcast double addrspace(3)* %arrayidx2 to i8 addrspace(3)* -; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* align 8 dereferenceable(16) %i, i8 addrspace(3)* align 8 dereferenceable(16) %i1, i64 16, i1 false) +; CHECK: call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) align 8 dereferenceable(16) %arrayidx1, ptr addrspace(3) align 8 dereferenceable(16) %arrayidx2, i64 16, i1 false) define amdgpu_kernel void @promote_alloca_used_twice_in_memcpy(i32 %c) { entry: %r = alloca double, align 8, addrspace(5) - %arrayidx1 = getelementptr inbounds double, double addrspace(5)* %r, i32 1 - %i = bitcast double addrspace(5)* %arrayidx1 to i8 addrspace(5)* - %arrayidx2 = getelementptr inbounds double, double addrspace(5)* %r, i32 %c - %i1 = bitcast double addrspace(5)* %arrayidx2 to i8 addrspace(5)* - call void @llvm.memcpy.p5i8.p5i8.i64(i8 addrspace(5)* align 8 dereferenceable(16) %i, i8 addrspace(5)* align 8 dereferenceable(16) %i1, i64 16, i1 false) + %arrayidx1 = getelementptr inbounds double, ptr addrspace(5) %r, i32 1 + %arrayidx2 = getelementptr inbounds double, ptr addrspace(5) %r, i32 %c + call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 dereferenceable(16) %arrayidx1, ptr addrspace(5) align 8 dereferenceable(16) %arrayidx2, i64 16, i1 false) ret void } ; CHECK-LABEL: @promote_alloca_used_twice_in_memmove( -; CHECK: %i = bitcast double addrspace(3)* %arrayidx1 to i8 addrspace(3)* -; CHECK: %i1 = bitcast double addrspace(3)* %arrayidx2 to i8 addrspace(3)* -; CHECK: call void @llvm.memmove.p3i8.p3i8.i64(i8 addrspace(3)* align 8 dereferenceable(16) %i, i8 addrspace(3)* align 8 dereferenceable(16) %i1, i64 16, i1 false) +; CHECK: call void @llvm.memmove.p3.p3.i64(ptr addrspace(3) align 8 dereferenceable(16) %arrayidx1, ptr addrspace(3) align 8 dereferenceable(16) %arrayidx2, i64 16, i1 false) define amdgpu_kernel void @promote_alloca_used_twice_in_memmove(i32 %c) { entry: %r = alloca double, align 8, addrspace(5) - %arrayidx1 = getelementptr inbounds double, double addrspace(5)* %r, i32 1 - %i = bitcast double addrspace(5)* %arrayidx1 to i8 addrspace(5)* - %arrayidx2 = getelementptr inbounds double, double addrspace(5)* %r, i32 %c - %i1 = bitcast double addrspace(5)* %arrayidx2 to i8 addrspace(5)* - call void @llvm.memmove.p5i8.p5i8.i64(i8 addrspace(5)* align 8 dereferenceable(16) %i, i8 addrspace(5)* align 8 dereferenceable(16) %i1, i64 16, i1 false) + %arrayidx1 = getelementptr inbounds double, ptr addrspace(5) %r, i32 1 + %arrayidx2 = getelementptr inbounds double, ptr addrspace(5) %r, i32 %c + call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) align 8 dereferenceable(16) %arrayidx1, ptr addrspace(5) align 8 dereferenceable(16) %arrayidx2, i64 16, i1 false) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll index 27d2f0328309..bfad67e82311 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll @@ -5,32 +5,30 @@ ; NOOPTS: workgroup_group_segment_byte_size = 0{{$}} ; NOOPTS-NOT: ds_write ; OPTS: ds_write -define amdgpu_kernel void @promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @promote_alloca_i32_array_array(ptr addrspace(1) %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i32]], addrspace(5) - %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0 - %gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1 - store i32 0, i32 addrspace(5)* %gep0 - store i32 1, i32 addrspace(5)* %gep1 - %gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index - %load = load i32, i32 addrspace(5)* %gep2 - store i32 %load, i32 addrspace(1)* %out + %gep1 = getelementptr inbounds [2 x [2 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1 + store i32 0, ptr addrspace(5) %alloca + store i32 1, ptr addrspace(5) %gep1 + %gep2 = getelementptr inbounds [2 x [2 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 %index + %load = load i32, ptr addrspace(5) %gep2 + store i32 %load, ptr addrspace(1) %out ret void } ; ALL-LABEL: {{^}}optnone_promote_alloca_i32_array_array: ; ALL: workgroup_group_segment_byte_size = 0{{$}} ; ALL-NOT: ds_write -define amdgpu_kernel void @optnone_promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #1 { +define amdgpu_kernel void @optnone_promote_alloca_i32_array_array(ptr addrspace(1) %out, i32 %index) #1 { entry: %alloca = alloca [2 x [2 x i32]], addrspace(5) - %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0 - %gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1 - store i32 0, i32 addrspace(5)* %gep0 - store i32 1, i32 addrspace(5)* %gep1 - %gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index - %load = load i32, i32 addrspace(5)* %gep2 - store i32 %load, i32 addrspace(1)* %out + %gep1 = getelementptr inbounds [2 x [2 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1 + store i32 0, ptr addrspace(5) %alloca + store i32 1, ptr addrspace(5) %gep1 + %gep2 = getelementptr inbounds [2 x [2 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 %index + %load = load i32, ptr addrspace(5) %gep2 + store i32 %load, ptr addrspace(1) %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll index d63bd451da53..d314b4695304 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll @@ -32,64 +32,62 @@ ; GCN-LABEL: {{^}}promote_alloca_size_order_0: ; GCN: workgroup_group_segment_byte_size = 1060 -define amdgpu_kernel void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { +define amdgpu_kernel void @promote_alloca_size_order_0(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in, i32 %idx) #0 { entry: %stack = alloca [5 x i32], align 4, addrspace(5) - %tmp0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp0 - store i32 4, i32 addrspace(5)* %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 - %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp1 - store i32 5, i32 addrspace(5)* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 - %tmp2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 - store i32 %tmp2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 - %tmp3 = load i32, i32 addrspace(5)* %arrayidx12 - %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 - store i32 %tmp3, i32 addrspace(1)* %arrayidx13 + %tmp0 = load i32, ptr addrspace(1) %in, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp0 + store i32 4, ptr addrspace(5) %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1 + %tmp1 = load i32, ptr addrspace(1) %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp1 + store i32 5, ptr addrspace(5) %arrayidx3, align 4 + %tmp2 = load i32, ptr addrspace(5) %stack, align 4 + store i32 %tmp2, ptr addrspace(1) %out, align 4 + %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1 + %tmp3 = load i32, ptr addrspace(5) %arrayidx12 + %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1 + store i32 %tmp3, ptr addrspace(1) %arrayidx13 - %gep.lds1 = getelementptr inbounds [73 x i32], [73 x i32] addrspace(3)* @lds1, i32 0, i32 %idx - store volatile i32 0, i32 addrspace(3)* %gep.lds1, align 4 + %gep.lds1 = getelementptr inbounds [73 x i32], ptr addrspace(3) @lds1, i32 0, i32 %idx + store volatile i32 0, ptr addrspace(3) %gep.lds1, align 4 - %gep.lds2 = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @lds2, i32 0, i32 %idx - store volatile i64 0, i64 addrspace(3)* %gep.lds2, align 8 + %gep.lds2 = getelementptr inbounds [32 x i64], ptr addrspace(3) @lds2, i32 0, i32 %idx + store volatile i64 0, ptr addrspace(3) %gep.lds2, align 8 - %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], [32 x <4 x i32>] addrspace(3)* @lds0, i32 0, i32 %idx - store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds0, align 16 + %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], ptr addrspace(3) @lds0, i32 0, i32 %idx + store volatile <4 x i32> zeroinitializer, ptr addrspace(3) %gep.lds0, align 16 ret void } ; GCN-LABEL: {{^}}promote_alloca_size_order_1: ; GCN: workgroup_group_segment_byte_size = 1072 -define amdgpu_kernel void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { +define amdgpu_kernel void @promote_alloca_size_order_1(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in, i32 %idx) #0 { entry: %stack = alloca [5 x i32], align 4, addrspace(5) - %tmp0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp0 - store i32 4, i32 addrspace(5)* %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 - %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp1 - store i32 5, i32 addrspace(5)* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 - %tmp2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 - store i32 %tmp2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 - %tmp3 = load i32, i32 addrspace(5)* %arrayidx12 - %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 - store i32 %tmp3, i32 addrspace(1)* %arrayidx13 + %tmp0 = load i32, ptr addrspace(1) %in, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp0 + store i32 4, ptr addrspace(5) %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1 + %tmp1 = load i32, ptr addrspace(1) %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp1 + store i32 5, ptr addrspace(5) %arrayidx3, align 4 + %tmp2 = load i32, ptr addrspace(5) %stack, align 4 + store i32 %tmp2, ptr addrspace(1) %out, align 4 + %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1 + %tmp3 = load i32, ptr addrspace(5) %arrayidx12 + %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1 + store i32 %tmp3, ptr addrspace(1) %arrayidx13 - %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], [32 x <4 x i32>] addrspace(3)* @lds0, i32 0, i32 %idx - store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds0, align 16 + %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], ptr addrspace(3) @lds0, i32 0, i32 %idx + store volatile <4 x i32> zeroinitializer, ptr addrspace(3) %gep.lds0, align 16 - %gep.lds2 = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @lds2, i32 0, i32 %idx - store volatile i64 0, i64 addrspace(3)* %gep.lds2, align 8 + %gep.lds2 = getelementptr inbounds [32 x i64], ptr addrspace(3) @lds2, i32 0, i32 %idx + store volatile i64 0, ptr addrspace(3) %gep.lds2, align 8 - %gep.lds1 = getelementptr inbounds [73 x i32], [73 x i32] addrspace(3)* @lds1, i32 0, i32 %idx - store volatile i32 0, i32 addrspace(3)* %gep.lds1, align 4 + %gep.lds1 = getelementptr inbounds [73 x i32], ptr addrspace(3) @lds1, i32 0, i32 %idx + store volatile i32 0, ptr addrspace(3) %gep.lds1, align 4 ret void } @@ -102,29 +100,28 @@ entry: ; GCN-LABEL: {{^}}promote_alloca_align_pad_guess_over_limit: ; GCN: workgroup_group_segment_byte_size = 1060 -define amdgpu_kernel void @promote_alloca_align_pad_guess_over_limit(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { +define amdgpu_kernel void @promote_alloca_align_pad_guess_over_limit(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in, i32 %idx) #0 { entry: %stack = alloca [5 x i32], align 4, addrspace(5) - %tmp0 = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp0 - store i32 4, i32 addrspace(5)* %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 - %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp1 - store i32 5, i32 addrspace(5)* %arrayidx3, align 4 - %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0 - %tmp2 = load i32, i32 addrspace(5)* %arrayidx10, align 4 - store i32 %tmp2, i32 addrspace(1)* %out, align 4 - %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1 - %tmp3 = load i32, i32 addrspace(5)* %arrayidx12 - %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1 - store i32 %tmp3, i32 addrspace(1)* %arrayidx13 + %tmp0 = load i32, ptr addrspace(1) %in, align 4 + %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp0 + store i32 4, ptr addrspace(5) %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1 + %tmp1 = load i32, ptr addrspace(1) %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp1 + store i32 5, ptr addrspace(5) %arrayidx3, align 4 + %tmp2 = load i32, ptr addrspace(5) %stack, align 4 + store i32 %tmp2, ptr addrspace(1) %out, align 4 + %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1 + %tmp3 = load i32, ptr addrspace(5) %arrayidx12 + %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1 + store i32 %tmp3, ptr addrspace(1) %arrayidx13 - %gep.lds3 = getelementptr inbounds [13 x i32], [13 x i32] addrspace(3)* @lds3, i32 0, i32 %idx - store volatile i32 0, i32 addrspace(3)* %gep.lds3, align 4 + %gep.lds3 = getelementptr inbounds [13 x i32], ptr addrspace(3) @lds3, i32 0, i32 %idx + store volatile i32 0, ptr addrspace(3) %gep.lds3, align 4 - %gep.lds4 = getelementptr inbounds [63 x <4 x i32>], [63 x <4 x i32>] addrspace(3)* @lds4, i32 0, i32 %idx - store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds4, align 16 + %gep.lds4 = getelementptr inbounds [63 x <4 x i32>], ptr addrspace(3) @lds4, i32 0, i32 %idx + store volatile <4 x i32> zeroinitializer, ptr addrspace(3) %gep.lds4, align 16 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll index 1b933ddf9df6..919710f9a8de 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-pointer-array.ll @@ -4,25 +4,19 @@ define i64 @test_pointer_array(i64 %v) { ; OPT-LABEL: @test_pointer_array( ; OPT-NEXT: entry: -; OPT-NEXT: [[A:%.*]] = alloca [3 x i8*], align 16, addrspace(5) -; OPT-NEXT: [[GEP:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*] addrspace(5)* [[A]], i32 0, i32 0 -; OPT-NEXT: [[CAST:%.*]] = bitcast i8* addrspace(5)* [[GEP]] to i64 addrspace(5)* -; OPT-NEXT: [[TMP0:%.*]] = bitcast [3 x i8*] addrspace(5)* [[A]] to <3 x i8*> addrspace(5)* -; OPT-NEXT: [[TMP1:%.*]] = load <3 x i8*>, <3 x i8*> addrspace(5)* [[TMP0]], align 32 -; OPT-NEXT: [[TMP2:%.*]] = inttoptr i64 [[V:%.*]] to i8* -; OPT-NEXT: [[TMP3:%.*]] = insertelement <3 x i8*> [[TMP1]], i8* [[TMP2]], i32 0 -; OPT-NEXT: store <3 x i8*> [[TMP3]], <3 x i8*> addrspace(5)* [[TMP0]], align 32 -; OPT-NEXT: [[TMP4:%.*]] = bitcast [3 x i8*] addrspace(5)* [[A]] to <3 x i8*> addrspace(5)* -; OPT-NEXT: [[TMP5:%.*]] = load <3 x i8*>, <3 x i8*> addrspace(5)* [[TMP4]], align 32 -; OPT-NEXT: [[TMP6:%.*]] = extractelement <3 x i8*> [[TMP5]], i32 0 -; OPT-NEXT: [[TMP7:%.*]] = ptrtoint i8* [[TMP6]] to i64 +; OPT-NEXT: [[A:%.*]] = alloca [3 x ptr], align 16, addrspace(5) +; OPT-NEXT: [[TMP1:%.*]] = load <3 x ptr>, ptr addrspace(5) [[A]], align 32 +; OPT-NEXT: [[TMP2:%.*]] = inttoptr i64 [[V:%.*]] to ptr +; OPT-NEXT: [[TMP3:%.*]] = insertelement <3 x ptr> [[TMP1]], ptr [[TMP2]], i32 0 +; OPT-NEXT: store <3 x ptr> [[TMP3]], ptr addrspace(5) [[A]], align 32 +; OPT-NEXT: [[TMP5:%.*]] = load <3 x ptr>, ptr addrspace(5) [[A]], align 32 +; OPT-NEXT: [[TMP6:%.*]] = extractelement <3 x ptr> [[TMP5]], i32 0 +; OPT-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP6]] to i64 ; OPT-NEXT: ret i64 [[TMP7]] ; entry: - %a = alloca [3 x i8*], align 16, addrspace(5) - %gep = getelementptr inbounds [3 x i8*], [3 x i8*] addrspace(5)* %a, i32 0, i32 0 - %cast = bitcast i8* addrspace(5)* %gep to i64 addrspace(5)* - store i64 %v, i64 addrspace(5)* %cast, align 16 - %ld = load i64, i64 addrspace(5)* %cast, align 16 + %a = alloca [3 x ptr], align 16, addrspace(5) + store i64 %v, ptr addrspace(5) %a, align 16 + %ld = load i64, ptr addrspace(5) %a, align 16 ret i64 %ld } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll index b57323a026eb..1214268c65fa 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll @@ -5,22 +5,22 @@ ; GCN-LABEL: {{^}}stored_lds_pointer_value: ; GCN: buffer_store_dword v -define amdgpu_kernel void @stored_lds_pointer_value(float addrspace(5)* addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @stored_lds_pointer_value(ptr addrspace(1) %ptr) #0 { %tmp = alloca float, addrspace(5) - store float 0.0, float addrspace(5)*%tmp - store float addrspace(5)* %tmp, float addrspace(5)* addrspace(1)* %ptr + store float 0.0, ptr addrspace(5) %tmp + store ptr addrspace(5) %tmp, ptr addrspace(1) %ptr ret void } ; GCN-LABEL: {{^}}stored_lds_pointer_value_offset: ; GCN: buffer_store_dword v -define amdgpu_kernel void @stored_lds_pointer_value_offset(float addrspace(5)* addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @stored_lds_pointer_value_offset(ptr addrspace(1) %ptr) #0 { %tmp0 = alloca float, addrspace(5) %tmp1 = alloca float, addrspace(5) - store float 0.0, float addrspace(5)*%tmp0 - store float 0.0, float addrspace(5)*%tmp1 - store volatile float addrspace(5)* %tmp0, float addrspace(5)* addrspace(1)* %ptr - store volatile float addrspace(5)* %tmp1, float addrspace(5)* addrspace(1)* %ptr + store float 0.0, ptr addrspace(5) %tmp0 + store float 0.0, ptr addrspace(5) %tmp1 + store volatile ptr addrspace(5) %tmp0, ptr addrspace(1) %ptr + store volatile ptr addrspace(5) %tmp1, ptr addrspace(1) %ptr ret void } @@ -29,12 +29,12 @@ define amdgpu_kernel void @stored_lds_pointer_value_offset(float addrspace(5)* a ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; GCN: buffer_store_dword v ; GCN: buffer_store_dword v -define amdgpu_kernel void @stored_lds_pointer_value_gep(float addrspace(5)* addrspace(1)* %ptr, i32 %idx) #0 { +define amdgpu_kernel void @stored_lds_pointer_value_gep(ptr addrspace(1) %ptr, i32 %idx) #0 { bb: %tmp = alloca float, i32 16, addrspace(5) - store float 0.0, float addrspace(5)* %tmp - %tmp2 = getelementptr inbounds float, float addrspace(5)* %tmp, i32 %idx - store float addrspace(5)* %tmp2, float addrspace(5)* addrspace(1)* %ptr + store float 0.0, ptr addrspace(5) %tmp + %tmp2 = getelementptr inbounds float, ptr addrspace(5) %tmp, i32 %idx + store ptr addrspace(5) %tmp2, ptr addrspace(1) %ptr ret void } @@ -46,29 +46,27 @@ bb: ; GCN: buffer_store_dword ; GCN: buffer_store_dword ; GCN: buffer_store_dword -define amdgpu_kernel void @stored_vector_pointer_value(i32 addrspace(5)* addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @stored_vector_pointer_value(ptr addrspace(1) %out, i32 %index) { entry: %tmp0 = alloca [4 x i32], addrspace(5) - %x = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp0, i32 0, i32 0 - %y = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp0, i32 0, i32 1 - %z = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp0, i32 0, i32 2 - %w = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp0, i32 0, i32 3 - store i32 0, i32 addrspace(5)* %x - store i32 1, i32 addrspace(5)* %y - store i32 2, i32 addrspace(5)* %z - store i32 3, i32 addrspace(5)* %w - %tmp1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp0, i32 0, i32 %index - store i32 addrspace(5)* %tmp1, i32 addrspace(5)* addrspace(1)* %out + %y = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp0, i32 0, i32 1 + %z = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp0, i32 0, i32 2 + %w = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp0, i32 0, i32 3 + store i32 0, ptr addrspace(5) %tmp0 + store i32 1, ptr addrspace(5) %y + store i32 2, ptr addrspace(5) %z + store i32 3, ptr addrspace(5) %w + %tmp1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp0, i32 0, i32 %index + store ptr addrspace(5) %tmp1, ptr addrspace(1) %out ret void } ; GCN-LABEL: {{^}}stored_fi_to_self: ; GCN-NOT: ds_ define amdgpu_kernel void @stored_fi_to_self() #0 { - %tmp = alloca i32 addrspace(5)*, addrspace(5) - store volatile i32 addrspace(5)* inttoptr (i32 1234 to i32 addrspace(5)*), i32 addrspace(5)* addrspace(5)* %tmp - %bitcast = bitcast i32 addrspace(5)* addrspace(5)* %tmp to i32 addrspace(5)* - store volatile i32 addrspace(5)* %bitcast, i32 addrspace(5)* addrspace(5)* %tmp + %tmp = alloca ptr addrspace(5), addrspace(5) + store volatile ptr addrspace(5) inttoptr (i32 1234 to ptr addrspace(5)), ptr addrspace(5) %tmp + store volatile ptr addrspace(5) %tmp, ptr addrspace(5) %tmp ret void } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-strip-abi-opt-attributes.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-strip-abi-opt-attributes.ll index 3bce73922908..ada1b841cd67 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-strip-abi-opt-attributes.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-strip-abi-opt-attributes.ll @@ -3,23 +3,22 @@ ; This kernel starts with the amdgpu-no-workitem-id-* attributes, but ; need to be removed when these intrinsic uses are introduced. -; CHECK-LABEL: define amdgpu_kernel void @promote_to_lds(i32 addrspace(1)* %out, i32 %in) #0 { -; CHECK: call noalias nonnull dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() +; CHECK-LABEL: define amdgpu_kernel void @promote_to_lds(ptr addrspace(1) %out, i32 %in) #0 { +; CHECK: call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() ; CHECK: call i32 @llvm.amdgcn.workitem.id.x(), !range !2 ; CHECK: call i32 @llvm.amdgcn.workitem.id.y(), !range !2 ; CHECK: call i32 @llvm.amdgcn.workitem.id.z(), !range !2 -define amdgpu_kernel void @promote_to_lds(i32 addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @promote_to_lds(ptr addrspace(1) %out, i32 %in) #0 { entry: %tmp = alloca [2 x i32], addrspace(5) - %tmp1 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 0 - %tmp2 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 1 - store i32 0, i32 addrspace(5)* %tmp1 - store i32 1, i32 addrspace(5)* %tmp2 - %tmp3 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 %in - %tmp4 = load i32, i32 addrspace(5)* %tmp3 - %tmp5 = load volatile i32, i32 addrspace(1)* undef + %tmp2 = getelementptr inbounds [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 1 + store i32 0, ptr addrspace(5) %tmp + store i32 1, ptr addrspace(5) %tmp2 + %tmp3 = getelementptr inbounds [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 %in + %tmp4 = load i32, ptr addrspace(5) %tmp3 + %tmp5 = load volatile i32, ptr addrspace(1) undef %tmp6 = add i32 %tmp4, %tmp5 - store i32 %tmp6, i32 addrspace(1)* %out + store i32 %tmp6, ptr addrspace(1) %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll index bc889d9c467c..83acd5eddc81 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll @@ -7,8 +7,8 @@ target datalayout = "A5" @some_lds = internal unnamed_addr addrspace(3) global [32 x i32] undef, align 4 @some_dynamic_lds = external hidden addrspace(3) global [0 x i32], align 4 -@initializer_user_some = addrspace(1) global i32 ptrtoint ([32 x i32] addrspace(3)* @some_lds to i32), align 4 -@initializer_user_all = addrspace(1) global i32 ptrtoint ([16384 x i32] addrspace(3)* @all_lds to i32), align 4 +@initializer_user_some = addrspace(1) global i32 ptrtoint (ptr addrspace(3) @some_lds to i32), align 4 +@initializer_user_all = addrspace(1) global i32 ptrtoint (ptr addrspace(3) @all_lds to i32), align 4 ; This function cannot promote to using LDS because of the size of the ; constant expression use in the function, which was previously not @@ -18,22 +18,21 @@ target datalayout = "A5" ; ASM-LABEL: constant_expression_uses_all_lds: ; ASM: .amdhsa_group_segment_fixed_size 65536 -define amdgpu_kernel void @constant_expression_uses_all_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 { +define amdgpu_kernel void @constant_expression_uses_all_lds(ptr addrspace(1) nocapture %out, i32 %idx) #0 { entry: %stack = alloca [4 x i32], align 4, addrspace(5) - %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0 - %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1 - %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2 - %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3 - store i32 9, i32 addrspace(5)* %gep0 - store i32 10, i32 addrspace(5)* %gep1 - store i32 99, i32 addrspace(5)* %gep2 - store i32 43, i32 addrspace(5)* %gep3 - %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx - %load = load i32, i32 addrspace(5)* %arrayidx, align 4 - store i32 %load, i32 addrspace(1)* %out + %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1 + %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2 + %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3 + store i32 9, ptr addrspace(5) %stack + store i32 10, ptr addrspace(5) %gep1 + store i32 99, ptr addrspace(5) %gep2 + store i32 43, ptr addrspace(5) %gep3 + %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx + %load = load i32, ptr addrspace(5) %arrayidx, align 4 + store i32 %load, ptr addrspace(1) %out - store volatile i32 ptrtoint ([16384 x i32] addrspace(3)* @all_lds to i32), i32 addrspace(1)* undef + store volatile i32 ptrtoint (ptr addrspace(3) @all_lds to i32), ptr addrspace(1) undef ret void } @@ -45,21 +44,20 @@ entry: ; ASM-LABEL: {{^}}constant_expression_uses_some_lds: ; ASM: .amdhsa_group_segment_fixed_size 4224{{$}} -define amdgpu_kernel void @constant_expression_uses_some_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 { +define amdgpu_kernel void @constant_expression_uses_some_lds(ptr addrspace(1) nocapture %out, i32 %idx) #0 { entry: %stack = alloca [4 x i32], align 4, addrspace(5) - %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0 - %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1 - %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2 - %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3 - store i32 9, i32 addrspace(5)* %gep0 - store i32 10, i32 addrspace(5)* %gep1 - store i32 99, i32 addrspace(5)* %gep2 - store i32 43, i32 addrspace(5)* %gep3 - %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx - %load = load i32, i32 addrspace(5)* %arrayidx, align 4 - store i32 %load, i32 addrspace(1)* %out - store volatile i32 ptrtoint ([32 x i32] addrspace(3)* @some_lds to i32), i32 addrspace(1)* undef + %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1 + %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2 + %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3 + store i32 9, ptr addrspace(5) %stack + store i32 10, ptr addrspace(5) %gep1 + store i32 99, ptr addrspace(5) %gep2 + store i32 43, ptr addrspace(5) %gep3 + %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx + %load = load i32, ptr addrspace(5) %arrayidx, align 4 + store i32 %load, ptr addrspace(1) %out + store volatile i32 ptrtoint (ptr addrspace(3) @some_lds to i32), ptr addrspace(1) undef ret void } @@ -71,47 +69,44 @@ entry: ; ASM-LABEL: {{^}}constant_expression_uses_some_dynamic_lds: ; ASM: .amdhsa_group_segment_fixed_size 0{{$}} -define amdgpu_kernel void @constant_expression_uses_some_dynamic_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 { +define amdgpu_kernel void @constant_expression_uses_some_dynamic_lds(ptr addrspace(1) nocapture %out, i32 %idx) #0 { entry: %stack = alloca [4 x i32], align 4, addrspace(5) - %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0 - %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1 - %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2 - %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3 - store i32 9, i32 addrspace(5)* %gep0 - store i32 10, i32 addrspace(5)* %gep1 - store i32 99, i32 addrspace(5)* %gep2 - store i32 43, i32 addrspace(5)* %gep3 - %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx - %load = load i32, i32 addrspace(5)* %arrayidx, align 4 - store i32 %load, i32 addrspace(1)* %out - %gep_dyn_lds = getelementptr inbounds [0 x i32], [0 x i32]* addrspacecast ([0 x i32] addrspace(3)* @some_dynamic_lds to [0 x i32]*), i64 0, i64 0 - store i32 1234, i32* %gep_dyn_lds, align 4 + %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1 + %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2 + %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3 + store i32 9, ptr addrspace(5) %stack + store i32 10, ptr addrspace(5) %gep1 + store i32 99, ptr addrspace(5) %gep2 + store i32 43, ptr addrspace(5) %gep3 + %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx + %load = load i32, ptr addrspace(5) %arrayidx, align 4 + store i32 %load, ptr addrspace(1) %out + store i32 1234, ptr addrspacecast (ptr addrspace(3) @some_dynamic_lds to ptr), align 4 ret void } -declare void @callee(i8*) +declare void @callee(ptr) ; IR-LABEL: @constant_expression_uses_all_lds_multi_level( ; IR: alloca ; ASM-LABEL: {{^}}constant_expression_uses_all_lds_multi_level: ; ASM: .amdhsa_group_segment_fixed_size 65536{{$}} -define amdgpu_kernel void @constant_expression_uses_all_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 { +define amdgpu_kernel void @constant_expression_uses_all_lds_multi_level(ptr addrspace(1) nocapture %out, i32 %idx) #0 { entry: %stack = alloca [4 x i32], align 4, addrspace(5) - %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0 - %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1 - %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2 - %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3 - store i32 9, i32 addrspace(5)* %gep0 - store i32 10, i32 addrspace(5)* %gep1 - store i32 99, i32 addrspace(5)* %gep2 - store i32 43, i32 addrspace(5)* %gep3 - %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx - %load = load i32, i32 addrspace(5)* %arrayidx, align 4 - store i32 %load, i32 addrspace(1)* %out - call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([16384 x i32], [16384 x i32] addrspace(3)* @all_lds, i32 0, i32 8) to i8 addrspace(3)*) to i8*)) + %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1 + %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2 + %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3 + store i32 9, ptr addrspace(5) %stack + store i32 10, ptr addrspace(5) %gep1 + store i32 99, ptr addrspace(5) %gep2 + store i32 43, ptr addrspace(5) %gep3 + %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx + %load = load i32, ptr addrspace(5) %arrayidx, align 4 + store i32 %load, ptr addrspace(1) %out + call void @callee(ptr addrspacecast (ptr addrspace(3) getelementptr inbounds ([16384 x i32], ptr addrspace(3) @all_lds, i32 0, i32 8) to ptr)) ret void } @@ -121,21 +116,20 @@ entry: ; ASM-LABEL: {{^}}constant_expression_uses_some_lds_multi_level: ; ASM: .amdhsa_group_segment_fixed_size 4224{{$}} -define amdgpu_kernel void @constant_expression_uses_some_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 { +define amdgpu_kernel void @constant_expression_uses_some_lds_multi_level(ptr addrspace(1) nocapture %out, i32 %idx) #0 { entry: %stack = alloca [4 x i32], align 4, addrspace(5) - %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0 - %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1 - %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2 - %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3 - store i32 9, i32 addrspace(5)* %gep0 - store i32 10, i32 addrspace(5)* %gep1 - store i32 99, i32 addrspace(5)* %gep2 - store i32 43, i32 addrspace(5)* %gep3 - %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx - %load = load i32, i32 addrspace(5)* %arrayidx, align 4 - store i32 %load, i32 addrspace(1)* %out - call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(3)* @some_lds, i32 0, i32 8) to i8 addrspace(3)*) to i8*)) + %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1 + %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2 + %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3 + store i32 9, ptr addrspace(5) %stack + store i32 10, ptr addrspace(5) %gep1 + store i32 99, ptr addrspace(5) %gep2 + store i32 43, ptr addrspace(5) %gep3 + %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx + %load = load i32, ptr addrspace(5) %arrayidx, align 4 + store i32 %load, ptr addrspace(1) %out + call void @callee(ptr addrspacecast (ptr addrspace(3) getelementptr inbounds ([32 x i32], ptr addrspace(3) @some_lds, i32 0, i32 8) to ptr)) ret void } @@ -144,21 +138,20 @@ entry: ; ASM-LABEL: {{^}}constant_expression_uses_some_dynamic_lds_multi_level: ; ASM: .amdhsa_group_segment_fixed_size 0{{$}} -define amdgpu_kernel void @constant_expression_uses_some_dynamic_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 { +define amdgpu_kernel void @constant_expression_uses_some_dynamic_lds_multi_level(ptr addrspace(1) nocapture %out, i32 %idx) #0 { entry: %stack = alloca [4 x i32], align 4, addrspace(5) - %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0 - %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1 - %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2 - %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3 - store i32 9, i32 addrspace(5)* %gep0 - store i32 10, i32 addrspace(5)* %gep1 - store i32 99, i32 addrspace(5)* %gep2 - store i32 43, i32 addrspace(5)* %gep3 - %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx - %load = load i32, i32 addrspace(5)* %arrayidx, align 4 - store i32 %load, i32 addrspace(1)* %out - call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([0 x i32], [0 x i32] addrspace(3)* @some_dynamic_lds, i32 0, i32 0) to i8 addrspace(3)*) to i8*)) + %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1 + %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2 + %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3 + store i32 9, ptr addrspace(5) %stack + store i32 10, ptr addrspace(5) %gep1 + store i32 99, ptr addrspace(5) %gep2 + store i32 43, ptr addrspace(5) %gep3 + %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx + %load = load i32, ptr addrspace(5) %arrayidx, align 4 + store i32 %load, ptr addrspace(1) %out + call void @callee(ptr addrspacecast (ptr addrspace(3) @some_dynamic_lds to ptr)) ret void } @@ -168,22 +161,21 @@ entry: ; ASM-LABEL: {{^}}constant_expression_uses_some_lds_global_initializer: ; ASM: .amdhsa_group_segment_fixed_size 4096{{$}} -define amdgpu_kernel void @constant_expression_uses_some_lds_global_initializer(i32 addrspace(1)* nocapture %out, i32 %idx) #0 { +define amdgpu_kernel void @constant_expression_uses_some_lds_global_initializer(ptr addrspace(1) nocapture %out, i32 %idx) #0 { entry: %stack = alloca [4 x i32], align 4, addrspace(5) - %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0 - %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1 - %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2 - %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3 - store i32 9, i32 addrspace(5)* %gep0 - store i32 10, i32 addrspace(5)* %gep1 - store i32 99, i32 addrspace(5)* %gep2 - store i32 43, i32 addrspace(5)* %gep3 - %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx - %load = load i32, i32 addrspace(5)* %arrayidx, align 4 - store i32 %load, i32 addrspace(1)* %out + %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1 + %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2 + %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3 + store i32 9, ptr addrspace(5) %stack + store i32 10, ptr addrspace(5) %gep1 + store i32 99, ptr addrspace(5) %gep2 + store i32 43, ptr addrspace(5) %gep3 + %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx + %load = load i32, ptr addrspace(5) %arrayidx, align 4 + store i32 %load, ptr addrspace(1) %out - store volatile i32 ptrtoint (i32 addrspace(1)* @initializer_user_some to i32), i32 addrspace(1)* undef + store volatile i32 ptrtoint (ptr addrspace(1) @initializer_user_some to i32), ptr addrspace(1) undef ret void } @@ -195,21 +187,20 @@ entry: ; ASM-LABEL: {{^}}constant_expression_uses_all_lds_global_initializer: ; ASM: .group_segment_fixed_size: 65536 -define amdgpu_kernel void @constant_expression_uses_all_lds_global_initializer(i32 addrspace(1)* nocapture %out, i32 %idx) #0 { +define amdgpu_kernel void @constant_expression_uses_all_lds_global_initializer(ptr addrspace(1) nocapture %out, i32 %idx) #0 { entry: %stack = alloca [4 x i32], align 4, addrspace(5) - %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0 - %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1 - %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2 - %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3 - store i32 9, i32 addrspace(5)* %gep0 - store i32 10, i32 addrspace(5)* %gep1 - store i32 99, i32 addrspace(5)* %gep2 - store i32 43, i32 addrspace(5)* %gep3 - %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx - %load = load i32, i32 addrspace(5)* %arrayidx, align 4 - store i32 %load, i32 addrspace(1)* %out - store volatile i32 ptrtoint (i32 addrspace(1)* @initializer_user_all to i32), i32 addrspace(1)* undef + %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1 + %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2 + %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3 + store i32 9, ptr addrspace(5) %stack + store i32 10, ptr addrspace(5) %gep1 + store i32 99, ptr addrspace(5) %gep2 + store i32 43, ptr addrspace(5) %gep3 + %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx + %load = load i32, ptr addrspace(5) %arrayidx, align 4 + store i32 %load, ptr addrspace(1) %out + store volatile i32 ptrtoint (ptr addrspace(1) @initializer_user_all to i32), ptr addrspace(1) undef ret void } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll index f56f2e51766c..a982bc855ad5 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll @@ -2,86 +2,86 @@ ; CHECK-LABEL: @branch_ptr_var_same_alloca( -; CHECK: getelementptr inbounds [256 x [64 x i32]], [256 x [64 x i32]] addrspace(3)* @branch_ptr_var_same_alloca.alloca, i32 0, i32 %{{[0-9]+}} +; CHECK: [[GEP:%[0-9]+]] = getelementptr inbounds [256 x [64 x i32]], ptr addrspace(3) @branch_ptr_var_same_alloca.alloca, i32 0, i32 %{{[0-9]+}} ; CHECK: if: -; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %a +; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(3) [[GEP]], i32 0, i32 %a ; CHECK: else: -; CHECK: %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %15, i32 0, i32 %b +; CHECK: %arrayidx1 = getelementptr inbounds [64 x i32], ptr addrspace(3) [[GEP]], i32 0, i32 %b ; CHECK: endif: -; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ %arrayidx1, %else ] -; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4 +; CHECK: %phi.ptr = phi ptr addrspace(3) [ %arrayidx0, %if ], [ %arrayidx1, %else ] +; CHECK: store i32 0, ptr addrspace(3) %phi.ptr, align 4 define amdgpu_kernel void @branch_ptr_var_same_alloca(i32 %a, i32 %b) #0 { entry: %alloca = alloca [64 x i32], align 4, addrspace(5) br i1 undef, label %if, label %else if: - %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a + %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a br label %endif else: - %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %b + %arrayidx1 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %b br label %endif endif: - %phi.ptr = phi i32 addrspace(5)* [ %arrayidx0, %if ], [ %arrayidx1, %else ] - store i32 0, i32 addrspace(5)* %phi.ptr, align 4 + %phi.ptr = phi ptr addrspace(5) [ %arrayidx0, %if ], [ %arrayidx1, %else ] + store i32 0, ptr addrspace(5) %phi.ptr, align 4 ret void } ; CHECK-LABEL: @branch_ptr_phi_alloca_null_0( -; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ null, %entry ] +; CHECK: %phi.ptr = phi ptr addrspace(3) [ %arrayidx0, %if ], [ null, %entry ] define amdgpu_kernel void @branch_ptr_phi_alloca_null_0(i32 %a, i32 %b) #0 { entry: %alloca = alloca [64 x i32], align 4, addrspace(5) br i1 undef, label %if, label %endif if: - %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a + %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a br label %endif endif: - %phi.ptr = phi i32 addrspace(5)* [ %arrayidx0, %if ], [ null, %entry ] - store i32 0, i32 addrspace(5)* %phi.ptr, align 4 + %phi.ptr = phi ptr addrspace(5) [ %arrayidx0, %if ], [ null, %entry ] + store i32 0, ptr addrspace(5) %phi.ptr, align 4 ret void } ; CHECK-LABEL: @branch_ptr_phi_alloca_null_1( -; CHECK: %phi.ptr = phi i32 addrspace(3)* [ null, %entry ], [ %arrayidx0, %if ] +; CHECK: %phi.ptr = phi ptr addrspace(3) [ null, %entry ], [ %arrayidx0, %if ] define amdgpu_kernel void @branch_ptr_phi_alloca_null_1(i32 %a, i32 %b) #0 { entry: %alloca = alloca [64 x i32], align 4, addrspace(5) br i1 undef, label %if, label %endif if: - %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a + %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a br label %endif endif: - %phi.ptr = phi i32 addrspace(5)* [ null, %entry ], [ %arrayidx0, %if ] - store i32 0, i32 addrspace(5)* %phi.ptr, align 4 + %phi.ptr = phi ptr addrspace(5) [ null, %entry ], [ %arrayidx0, %if ] + store i32 0, ptr addrspace(5) %phi.ptr, align 4 ret void } ; CHECK-LABEL: @one_phi_value( -; CHECK: getelementptr inbounds [256 x [64 x i32]], [256 x [64 x i32]] addrspace(3)* @one_phi_value.alloca, i32 0, i32 %14 -; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %a +; CHECK: [[GEP0:%[0-9]+]] = getelementptr inbounds [256 x [64 x i32]], ptr addrspace(3) @one_phi_value.alloca, i32 0, i32 %{{[0-9]+}} +; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(3) [[GEP0]], i32 0, i32 %a ; CHECK: br label %exit -; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %entry ] -; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4 +; CHECK: %phi.ptr = phi ptr addrspace(3) [ %arrayidx0, %entry ] +; CHECK: store i32 0, ptr addrspace(3) %phi.ptr, align 4 define amdgpu_kernel void @one_phi_value(i32 %a) #0 { entry: %alloca = alloca [64 x i32], align 4, addrspace(5) - %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a + %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a br label %exit exit: - %phi.ptr = phi i32 addrspace(5)* [ %arrayidx0, %entry ] - store i32 0, i32 addrspace(5)* %phi.ptr, align 4 + %phi.ptr = phi ptr addrspace(5) [ %arrayidx0, %entry ] + store i32 0, ptr addrspace(5) %phi.ptr, align 4 ret void } @@ -89,30 +89,30 @@ exit: ; CHECK: %alloca = alloca [64 x i32], align 4 ; CHECK: if: -; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a +; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a ; CHECK: else: -; CHECK: %arrayidx1 = call i32 addrspace(5)* @get_unknown_pointer() +; CHECK: %arrayidx1 = call ptr addrspace(5) @get_unknown_pointer() ; CHECK: endif: -; CHECK: %phi.ptr = phi i32 addrspace(5)* [ %arrayidx0, %if ], [ %arrayidx1, %else ] -; CHECK: store i32 0, i32 addrspace(5)* %phi.ptr, align 4 +; CHECK: %phi.ptr = phi ptr addrspace(5) [ %arrayidx0, %if ], [ %arrayidx1, %else ] +; CHECK: store i32 0, ptr addrspace(5) %phi.ptr, align 4 define amdgpu_kernel void @branch_ptr_alloca_unknown_obj(i32 %a, i32 %b) #0 { entry: %alloca = alloca [64 x i32], align 4, addrspace(5) br i1 undef, label %if, label %else if: - %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a + %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a br label %endif else: - %arrayidx1 = call i32 addrspace(5)* @get_unknown_pointer() + %arrayidx1 = call ptr addrspace(5) @get_unknown_pointer() br label %endif endif: - %phi.ptr = phi i32 addrspace(5)* [ %arrayidx0, %if ], [ %arrayidx1, %else ] - store i32 0, i32 addrspace(5)* %phi.ptr, align 4 + %phi.ptr = phi ptr addrspace(5) [ %arrayidx0, %if ], [ %arrayidx1, %else ] + store i32 0, ptr addrspace(5) %phi.ptr, align 4 ret void } @@ -133,12 +133,12 @@ endif: ; CHECK-LABEL: @ptr_induction_var_same_alloca( ; CHECK: %alloca = alloca [64 x i32], align 4 -; CHECK: phi i32 addrspace(5)* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ] +; CHECK: phi ptr addrspace(5) [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ] define amdgpu_kernel void @ptr_induction_var_same_alloca() #0 { entry: %alloca = alloca [64 x i32], align 4, addrspace(5) - %arrayidx = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 2 - %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 48 + %arrayidx = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 2 + %arrayidx1 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 48 br label %for.body for.cond.cleanup: ; preds = %for.body @@ -146,11 +146,11 @@ for.cond.cleanup: ; preds = %for.body for.body: ; preds = %for.body, %entry %i.09 = phi i32 [ 0, %entry ], [ %inc, %for.body ] - %p.08 = phi i32 addrspace(5)* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ] - store i32 %i.09, i32 addrspace(5)* %p.08, align 4 - %incdec.ptr = getelementptr inbounds i32, i32 addrspace(5)* %p.08, i32 1 + %p.08 = phi ptr addrspace(5) [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ] + store i32 %i.09, ptr addrspace(5) %p.08, align 4 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %p.08, i32 1 %inc = add nuw nsw i32 %i.09, 1 - %cmp = icmp eq i32 addrspace(5)* %incdec.ptr, %arrayidx1 + %cmp = icmp eq ptr addrspace(5) %incdec.ptr, %arrayidx1 br i1 %cmp, label %for.cond.cleanup, label %for.body } @@ -170,14 +170,14 @@ for.body: ; preds = %for.body, %entry ; CHECK-LABEL: @ptr_induction_var_alloca_unknown( ; CHECK: %alloca = alloca [64 x i32], align 4 -; CHECK: %p.08 = phi i32 addrspace(5)* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ] -; CHECK: %cmp = icmp eq i32 addrspace(5)* %incdec.ptr, %call +; CHECK: %p.08 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ] +; CHECK: %cmp = icmp eq ptr addrspace(5) %incdec.ptr, %call define amdgpu_kernel void @ptr_induction_var_alloca_unknown() #0 { entry: %alloca = alloca [64 x i32], align 4, addrspace(5) - %arrayidx = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 2 - %call = tail call i32 addrspace(5)* @get_unknown_pointer() #2 - %cmp.7 = icmp eq i32 addrspace(5)* %arrayidx, %call + %arrayidx = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 2 + %call = tail call ptr addrspace(5) @get_unknown_pointer() #2 + %cmp.7 = icmp eq ptr addrspace(5) %arrayidx, %call br i1 %cmp.7, label %for.cond.cleanup, label %for.body.preheader for.body.preheader: ; preds = %entry @@ -191,14 +191,14 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo for.body: ; preds = %for.body, %for.body.preheader %i.09 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] - %p.08 = phi i32 addrspace(5)* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ] - store i32 %i.09, i32 addrspace(5)* %p.08, align 4 - %incdec.ptr = getelementptr inbounds i32, i32 addrspace(5)* %p.08, i32 1 + %p.08 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ] + store i32 %i.09, ptr addrspace(5) %p.08, align 4 + %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %p.08, i32 1 %inc = add nuw nsw i32 %i.09, 1 - %cmp = icmp eq i32 addrspace(5)* %incdec.ptr, %call + %cmp = icmp eq ptr addrspace(5) %incdec.ptr, %call br i1 %cmp, label %for.cond.cleanup.loopexit, label %for.body } -declare i32 addrspace(5)* @get_unknown_pointer() #0 +declare ptr addrspace(5) @get_unknown_pointer() #0 attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll index 1855fdc85970..7eee96762bc0 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll @@ -3,19 +3,18 @@ ; This is just an arbitrary intrinisic that shouldn't ever need to be ; handled to ensure it doesn't crash. -declare void @llvm.stackrestore(i8*) #2 +declare void @llvm.stackrestore(ptr) #2 ; CHECK-LABEL: @try_promote_unhandled_intrinsic( ; CHECK: alloca -; CHECK: call void @llvm.stackrestore(i8* %tmp1) -define amdgpu_kernel void @try_promote_unhandled_intrinsic(i32 addrspace(1)* %arg) #2 { +; CHECK: call void @llvm.stackrestore(ptr %tmp) +define amdgpu_kernel void @try_promote_unhandled_intrinsic(ptr addrspace(1) %arg) #2 { bb: %tmp = alloca i32, align 4 - %tmp1 = bitcast i32* %tmp to i8* - %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1 - %tmp3 = load i32, i32 addrspace(1)* %tmp2 - store i32 %tmp3, i32* %tmp - call void @llvm.stackrestore(i8* %tmp1) + %tmp2 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1 + %tmp3 = load i32, ptr addrspace(1) %tmp2 + store i32 %tmp3, ptr %tmp + call void @llvm.stackrestore(ptr %tmp) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll index 7cff51bf0bc7..adabeab37950 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-vector-to-vector.ll @@ -11,13 +11,13 @@ ; GCN: v_cndmask_b32_e32 [[RES:v[0-9]+]], 4.0, ; GCN: store_dword v{{.+}}, [[RES]] -; OPT: %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2 -; OPT: store <4 x float> , <4 x float> addrspace(5)* %alloca, align 4 -; OPT: %0 = load <4 x float>, <4 x float> addrspace(5)* %alloca +; OPT: %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2 +; OPT: store <4 x float> , ptr addrspace(5) %alloca, align 4 +; OPT: %0 = load <4 x float>, ptr addrspace(5) %alloca ; OPT: %1 = extractelement <4 x float> %0, i32 %sel2 -; OPT: store float %1, float addrspace(1)* %out, align 4 +; OPT: store float %1, ptr addrspace(1) %out, align 4 -define amdgpu_kernel void @float4_alloca_store4(float addrspace(1)* %out, float addrspace(3)* %dummy_lds) { +define amdgpu_kernel void @float4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) { entry: %alloca = alloca <4 x float>, align 16, addrspace(5) %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -26,10 +26,10 @@ entry: %c2 = icmp uge i32 %y, 3 %sel1 = select i1 %c1, i32 1, i32 2 %sel2 = select i1 %c2, i32 0, i32 %sel1 - %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2 - store <4 x float> , <4 x float> addrspace(5)* %alloca, align 4 - %load = load float, float addrspace(5)* %gep, align 4 - store float %load, float addrspace(1)* %out, align 4 + %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2 + store <4 x float> , ptr addrspace(5) %alloca, align 4 + %load = load float, ptr addrspace(5) %gep, align 4 + store float %load, ptr addrspace(1) %out, align 4 ret void } @@ -46,14 +46,14 @@ entry: ; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]] ; GCN: store_dwordx4 v{{.+}}, -; OPT: %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2 -; OPT: %0 = load <4 x float>, <4 x float> addrspace(5)* %alloca +; OPT: %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2 +; OPT: %0 = load <4 x float>, ptr addrspace(5) %alloca ; OPT: %1 = insertelement <4 x float> %0, float 1.000000e+00, i32 %sel2 -; OPT: store <4 x float> %1, <4 x float> addrspace(5)* %alloca -; OPT: %load = load <4 x float>, <4 x float> addrspace(5)* %alloca, align 4 -; OPT: store <4 x float> %load, <4 x float> addrspace(1)* %out, align 4 +; OPT: store <4 x float> %1, ptr addrspace(5) %alloca +; OPT: %load = load <4 x float>, ptr addrspace(5) %alloca, align 4 +; OPT: store <4 x float> %load, ptr addrspace(1) %out, align 4 -define amdgpu_kernel void @float4_alloca_load4(<4 x float> addrspace(1)* %out, float addrspace(3)* %dummy_lds) { +define amdgpu_kernel void @float4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) { entry: %alloca = alloca <4 x float>, align 16, addrspace(5) %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -62,10 +62,10 @@ entry: %c2 = icmp uge i32 %y, 3 %sel1 = select i1 %c1, i32 1, i32 2 %sel2 = select i1 %c2, i32 0, i32 %sel1 - %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2 - store float 1.0, float addrspace(5)* %gep, align 4 - %load = load <4 x float>, <4 x float> addrspace(5)* %alloca, align 4 - store <4 x float> %load, <4 x float> addrspace(1)* %out, align 4 + %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2 + store float 1.0, ptr addrspace(5) %gep, align 4 + %load = load <4 x float>, ptr addrspace(5) %alloca, align 4 + store <4 x float> %load, ptr addrspace(1) %out, align 4 ret void } @@ -77,13 +77,13 @@ entry: ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00 ; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]] -; OPT: %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2 -; OPT: store <4 x half> , <4 x half> addrspace(5)* %alloca, align 2 -; OPT: %0 = load <4 x half>, <4 x half> addrspace(5)* %alloca +; OPT: %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2 +; OPT: store <4 x half> , ptr addrspace(5) %alloca, align 2 +; OPT: %0 = load <4 x half>, ptr addrspace(5) %alloca ; OPT: %1 = extractelement <4 x half> %0, i32 %sel2 -; OPT: store half %1, half addrspace(1)* %out, align 2 +; OPT: store half %1, ptr addrspace(1) %out, align 2 -define amdgpu_kernel void @half4_alloca_store4(half addrspace(1)* %out, half addrspace(3)* %dummy_lds) { +define amdgpu_kernel void @half4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) { entry: %alloca = alloca <4 x half>, align 16, addrspace(5) %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -92,10 +92,10 @@ entry: %c2 = icmp uge i32 %y, 3 %sel1 = select i1 %c1, i32 1, i32 2 %sel2 = select i1 %c2, i32 0, i32 %sel1 - %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2 - store <4 x half> , <4 x half> addrspace(5)* %alloca, align 2 - %load = load half, half addrspace(5)* %gep, align 2 - store half %load, half addrspace(1)* %out, align 2 + %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2 + store <4 x half> , ptr addrspace(5) %alloca, align 2 + %load = load half, ptr addrspace(5) %gep, align 2 + store half %load, ptr addrspace(1) %out, align 2 ret void } @@ -105,14 +105,14 @@ entry: ; GCN-NOT: buffer_ ; GCN: s_mov_b64 s[{{[0-9:]+}}], 0xffff -; OPT: %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2 -; OPT: %0 = load <4 x half>, <4 x half> addrspace(5)* %alloca +; OPT: %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2 +; OPT: %0 = load <4 x half>, ptr addrspace(5) %alloca ; OPT: %1 = insertelement <4 x half> %0, half 0xH3C00, i32 %sel2 -; OPT: store <4 x half> %1, <4 x half> addrspace(5)* %alloca -; OPT: %load = load <4 x half>, <4 x half> addrspace(5)* %alloca, align 2 -; OPT: store <4 x half> %load, <4 x half> addrspace(1)* %out, align 2 +; OPT: store <4 x half> %1, ptr addrspace(5) %alloca +; OPT: %load = load <4 x half>, ptr addrspace(5) %alloca, align 2 +; OPT: store <4 x half> %load, ptr addrspace(1) %out, align 2 -define amdgpu_kernel void @half4_alloca_load4(<4 x half> addrspace(1)* %out, half addrspace(3)* %dummy_lds) { +define amdgpu_kernel void @half4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) { entry: %alloca = alloca <4 x half>, align 16, addrspace(5) %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -121,10 +121,10 @@ entry: %c2 = icmp uge i32 %y, 3 %sel1 = select i1 %c1, i32 1, i32 2 %sel2 = select i1 %c2, i32 0, i32 %sel1 - %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2 - store half 1.0, half addrspace(5)* %gep, align 4 - %load = load <4 x half>, <4 x half> addrspace(5)* %alloca, align 2 - store <4 x half> %load, <4 x half> addrspace(1)* %out, align 2 + %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2 + store half 1.0, ptr addrspace(5) %gep, align 4 + %load = load <4 x half>, ptr addrspace(5) %alloca, align 2 + store <4 x half> %load, ptr addrspace(1) %out, align 2 ret void } @@ -136,13 +136,13 @@ entry: ; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x20001 ; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]] -; OPT: %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2 -; OPT: store <4 x i16> , <4 x i16> addrspace(5)* %alloca, align 2 -; OPT: %0 = load <4 x i16>, <4 x i16> addrspace(5)* %alloca +; OPT: %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2 +; OPT: store <4 x i16> , ptr addrspace(5) %alloca, align 2 +; OPT: %0 = load <4 x i16>, ptr addrspace(5) %alloca ; OPT: %1 = extractelement <4 x i16> %0, i32 %sel2 -; OPT: store i16 %1, i16 addrspace(1)* %out, align 2 +; OPT: store i16 %1, ptr addrspace(1) %out, align 2 -define amdgpu_kernel void @short4_alloca_store4(i16 addrspace(1)* %out, i16 addrspace(3)* %dummy_lds) { +define amdgpu_kernel void @short4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) { entry: %alloca = alloca <4 x i16>, align 16, addrspace(5) %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -151,10 +151,10 @@ entry: %c2 = icmp uge i32 %y, 3 %sel1 = select i1 %c1, i32 1, i32 2 %sel2 = select i1 %c2, i32 0, i32 %sel1 - %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2 - store <4 x i16> , <4 x i16> addrspace(5)* %alloca, align 2 - %load = load i16, i16 addrspace(5)* %gep, align 2 - store i16 %load, i16 addrspace(1)* %out, align 2 + %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2 + store <4 x i16> , ptr addrspace(5) %alloca, align 2 + %load = load i16, ptr addrspace(5) %gep, align 2 + store i16 %load, ptr addrspace(1) %out, align 2 ret void } @@ -164,14 +164,14 @@ entry: ; GCN-NOT: buffer_ ; GCN: s_mov_b64 s[{{[0-9:]+}}], 0xffff -; OPT: %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2 -; OPT: %0 = load <4 x i16>, <4 x i16> addrspace(5)* %alloca +; OPT: %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2 +; OPT: %0 = load <4 x i16>, ptr addrspace(5) %alloca ; OPT: %1 = insertelement <4 x i16> %0, i16 1, i32 %sel2 -; OPT: store <4 x i16> %1, <4 x i16> addrspace(5)* %alloca -; OPT: %load = load <4 x i16>, <4 x i16> addrspace(5)* %alloca, align 2 -; OPT: store <4 x i16> %load, <4 x i16> addrspace(1)* %out, align 2 +; OPT: store <4 x i16> %1, ptr addrspace(5) %alloca +; OPT: %load = load <4 x i16>, ptr addrspace(5) %alloca, align 2 +; OPT: store <4 x i16> %load, ptr addrspace(1) %out, align 2 -define amdgpu_kernel void @short4_alloca_load4(<4 x i16> addrspace(1)* %out, i16 addrspace(3)* %dummy_lds) { +define amdgpu_kernel void @short4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) { entry: %alloca = alloca <4 x i16>, align 16, addrspace(5) %x = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -180,10 +180,10 @@ entry: %c2 = icmp uge i32 %y, 3 %sel1 = select i1 %c1, i32 1, i32 2 %sel2 = select i1 %c2, i32 0, i32 %sel1 - %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2 - store i16 1, i16 addrspace(5)* %gep, align 4 - %load = load <4 x i16>, <4 x i16> addrspace(5)* %alloca, align 2 - store <4 x i16> %load, <4 x i16> addrspace(1)* %out, align 2 + %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2 + store i16 1, ptr addrspace(5) %gep, align 4 + %load = load <4 x i16>, ptr addrspace(5) %alloca, align 2 + store <4 x i16> %load, ptr addrspace(1) %out, align 2 ret void } @@ -194,14 +194,12 @@ entry: ; GCN: v_mov_b32_e32 v1, 0 ; OPT: %private_iptr = alloca <2 x i32>, align 8, addrspace(5) -; OPT: %cast = bitcast <2 x i32> addrspace(5)* %private_iptr to i64 addrspace(5)* -; OPT: %tmp1 = load i64, i64 addrspace(5)* %cast, align 8 +; OPT: %tmp1 = load i64, ptr addrspace(5) %private_iptr, align 8 define i64 @ptr_alloca_bitcast() { entry: %private_iptr = alloca <2 x i32>, align 8, addrspace(5) - %cast = bitcast <2 x i32> addrspace(5)* %private_iptr to i64 addrspace(5)* - %tmp1 = load i64, i64 addrspace(5)* %cast, align 8 + %tmp1 = load i64, ptr addrspace(5) %private_iptr, align 8 ret i64 %tmp1 } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-volatile.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-volatile.ll index 5d35f1d73910..6bac2f92726a 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-volatile.ll @@ -2,26 +2,26 @@ ; CHECK-LABEL: @volatile_load( ; CHECK: alloca [4 x i32] -; CHECK: load volatile i32, i32 addrspace(5)* -define amdgpu_kernel void @volatile_load(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { +; CHECK: load volatile i32, ptr addrspace(5) +define amdgpu_kernel void @volatile_load(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) { entry: %stack = alloca [4 x i32], align 4, addrspace(5) - %tmp = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %tmp - %load = load volatile i32, i32 addrspace(5)* %arrayidx1 - store i32 %load, i32 addrspace(1)* %out + %tmp = load i32, ptr addrspace(1) %in, align 4 + %arrayidx1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp + %load = load volatile i32, ptr addrspace(5) %arrayidx1 + store i32 %load, ptr addrspace(1) %out ret void } ; CHECK-LABEL: @volatile_store( ; CHECK: alloca [4 x i32] -; CHECK: store volatile i32 %tmp, i32 addrspace(5)* -define amdgpu_kernel void @volatile_store(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { +; CHECK: store volatile i32 %tmp, ptr addrspace(5) +define amdgpu_kernel void @volatile_store(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) { entry: %stack = alloca [4 x i32], align 4, addrspace(5) - %tmp = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %tmp - store volatile i32 %tmp, i32 addrspace(5)* %arrayidx1 + %tmp = load i32, ptr addrspace(1) %in, align 4 + %arrayidx1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp + store volatile i32 %tmp, ptr addrspace(5) %arrayidx1 ret void } @@ -30,15 +30,15 @@ entry: ; CHECK: alloca double ; CHECK: load double ; CHECK: load volatile double -define amdgpu_kernel void @volatile_and_non_volatile_load(double addrspace(1)* nocapture %arg, i32 %arg1) #0 { +define amdgpu_kernel void @volatile_and_non_volatile_load(ptr addrspace(1) nocapture %arg, i32 %arg1) #0 { bb: %tmp = alloca double, align 8, addrspace(5) - store double 0.000000e+00, double addrspace(5)* %tmp, align 8 + store double 0.000000e+00, ptr addrspace(5) %tmp, align 8 - %tmp4 = load double, double addrspace(5)* %tmp, align 8 - %tmp5 = load volatile double, double addrspace(5)* %tmp, align 8 + %tmp4 = load double, ptr addrspace(5) %tmp, align 8 + %tmp5 = load volatile double, ptr addrspace(5) %tmp, align 8 - store double %tmp4, double addrspace(1)* %arg + store double %tmp4, ptr addrspace(1) %arg ret void } diff --git a/llvm/test/CodeGen/AMDGPU/skip-promote-alloca-vector-users.ll b/llvm/test/CodeGen/AMDGPU/skip-promote-alloca-vector-users.ll index f8a47cdaaaeb..e14ae06b8031 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-promote-alloca-vector-users.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-promote-alloca-vector-users.ll @@ -4,34 +4,34 @@ ; CHECK-LABEL: @test_insertelement( ; CHECK: %alloca = alloca i16 -; CHECK-NEXT: insertelement <2 x i16 addrspace(5)*> undef, i16 addrspace(5)* %alloca, i32 0 +; CHECK-NEXT: insertelement <2 x ptr addrspace(5)> undef, ptr addrspace(5) %alloca, i32 0 define amdgpu_kernel void @test_insertelement() #0 { entry: %alloca = alloca i16, align 4, addrspace(5) - %in = insertelement <2 x i16 addrspace(5)*> undef, i16 addrspace(5)* %alloca, i32 0 - store <2 x i16 addrspace(5)*> %in, <2 x i16 addrspace(5)*>* undef, align 4 + %in = insertelement <2 x ptr addrspace(5)> undef, ptr addrspace(5) %alloca, i32 0 + store <2 x ptr addrspace(5)> %in, ptr undef, align 4 ret void } ; CHECK-LABEL: @test_insertvalue( ; CHECK: %alloca = alloca i16 -; CHECK-NEXT: insertvalue { i16 addrspace(5)* } undef, i16 addrspace(5)* %alloca, 0 +; CHECK-NEXT: insertvalue { ptr addrspace(5) } undef, ptr addrspace(5) %alloca, 0 define amdgpu_kernel void @test_insertvalue() #0 { entry: %alloca = alloca i16, align 4, addrspace(5) - %in = insertvalue { i16 addrspace(5)* } undef, i16 addrspace(5)* %alloca, 0 - store { i16 addrspace(5)* } %in, { i16 addrspace(5)* }* undef, align 4 + %in = insertvalue { ptr addrspace(5) } undef, ptr addrspace(5) %alloca, 0 + store { ptr addrspace(5) } %in, ptr undef, align 4 ret void } ; CHECK-LABEL: @test_insertvalue_array( ; CHECK: %alloca = alloca i16 -; CHECK-NEXT: insertvalue [2 x i16 addrspace(5)*] undef, i16 addrspace(5)* %alloca, 0 +; CHECK-NEXT: insertvalue [2 x ptr addrspace(5)] undef, ptr addrspace(5) %alloca, 0 define amdgpu_kernel void @test_insertvalue_array() #0 { entry: %alloca = alloca i16, align 4, addrspace(5) - %in = insertvalue [2 x i16 addrspace(5)*] undef, i16 addrspace(5)* %alloca, 0 - store [2 x i16 addrspace(5)*] %in, [2 x i16 addrspace(5)*]* undef, align 4 + %in = insertvalue [2 x ptr addrspace(5)] undef, ptr addrspace(5) %alloca, 0 + store [2 x ptr addrspace(5)] %in, ptr undef, align 4 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-atomic.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-atomic.ll index d0de6af1a33e..d0107ec23927 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-alloca-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-atomic.ll @@ -5,61 +5,58 @@ ; OPT-LABEL: @vector_alloca_not_atomic( ; ; OPT: extractelement <3 x i32> , i64 %index -define amdgpu_kernel void @vector_alloca_not_atomic(i32 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @vector_alloca_not_atomic(ptr addrspace(1) %out, i64 %index) { entry: %alloca = alloca [3 x i32], addrspace(5) - %a0 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 0 - %a1 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 1 - %a2 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 2 - store i32 0, i32 addrspace(5)* %a0 - store i32 1, i32 addrspace(5)* %a1 - store i32 2, i32 addrspace(5)* %a2 - %tmp = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i64 0, i64 %index - %data = load i32, i32 addrspace(5)* %tmp - store i32 %data, i32 addrspace(1)* %out + %a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1 + %a2 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 2 + store i32 0, ptr addrspace(5) %alloca + store i32 1, ptr addrspace(5) %a1 + store i32 2, ptr addrspace(5) %a2 + %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index + %data = load i32, ptr addrspace(5) %tmp + store i32 %data, ptr addrspace(1) %out ret void } ; OPT-LABEL: @vector_alloca_atomic_read( ; ; OPT: alloca [3 x i32] -; OPT: store i32 0, i32 addrspace(5)* -; OPT: store i32 1, i32 addrspace(5)* -; OPT: store i32 2, i32 addrspace(5)* -; OPT: load atomic i32, i32 addrspace(5)* -define amdgpu_kernel void @vector_alloca_atomic_read(i32 addrspace(1)* %out, i64 %index) { +; OPT: store i32 0, ptr addrspace(5) +; OPT: store i32 1, ptr addrspace(5) +; OPT: store i32 2, ptr addrspace(5) +; OPT: load atomic i32, ptr addrspace(5) +define amdgpu_kernel void @vector_alloca_atomic_read(ptr addrspace(1) %out, i64 %index) { entry: %alloca = alloca [3 x i32], addrspace(5) - %a0 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 0 - %a1 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 1 - %a2 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 2 - store i32 0, i32 addrspace(5)* %a0 - store i32 1, i32 addrspace(5)* %a1 - store i32 2, i32 addrspace(5)* %a2 - %tmp = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i64 0, i64 %index - %data = load atomic i32, i32 addrspace(5)* %tmp acquire, align 4 - store i32 %data, i32 addrspace(1)* %out + %a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1 + %a2 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 2 + store i32 0, ptr addrspace(5) %alloca + store i32 1, ptr addrspace(5) %a1 + store i32 2, ptr addrspace(5) %a2 + %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index + %data = load atomic i32, ptr addrspace(5) %tmp acquire, align 4 + store i32 %data, ptr addrspace(1) %out ret void } ; OPT-LABEL: @vector_alloca_atomic_write( ; ; OPT: alloca [3 x i32] -; OPT: store atomic i32 0, i32 addrspace(5) -; OPT: store atomic i32 1, i32 addrspace(5) -; OPT: store atomic i32 2, i32 addrspace(5) -; OPT: load i32, i32 addrspace(5)* -define amdgpu_kernel void @vector_alloca_atomic_write(i32 addrspace(1)* %out, i64 %index) { +; OPT: store atomic i32 0, ptr addrspace(5) +; OPT: store atomic i32 1, ptr addrspace(5) +; OPT: store atomic i32 2, ptr addrspace(5) +; OPT: load i32, ptr addrspace(5) +define amdgpu_kernel void @vector_alloca_atomic_write(ptr addrspace(1) %out, i64 %index) { entry: %alloca = alloca [3 x i32], addrspace(5) - %a0 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 0 - %a1 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 1 - %a2 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 2 - store atomic i32 0, i32 addrspace(5)* %a0 release, align 4 - store atomic i32 1, i32 addrspace(5)* %a1 release, align 4 - store atomic i32 2, i32 addrspace(5)* %a2 release, align 4 - %tmp = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i64 0, i64 %index - %data = load i32, i32 addrspace(5)* %tmp - store i32 %data, i32 addrspace(1)* %out + %a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1 + %a2 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 2 + store atomic i32 0, ptr addrspace(5) %alloca release, align 4 + store atomic i32 1, ptr addrspace(5) %a1 release, align 4 + store atomic i32 2, ptr addrspace(5) %a2 release, align 4 + %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index + %data = load i32, ptr addrspace(5) %tmp + store i32 %data, ptr addrspace(1) %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll index 029082e5b2b4..b1d6deadc907 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll @@ -7,7 +7,7 @@ target datalayout = "A5" ; OPT-LABEL: @vector_read_alloca_bitcast( ; OPT-NOT: alloca ; OPT: %0 = extractelement <4 x i32> , i32 %index -; OPT-NEXT: store i32 %0, i32 addrspace(1)* %out, align 4 +; OPT-NEXT: store i32 %0, ptr addrspace(1) %out, align 4 ; GCN-LABEL: {{^}}vector_read_alloca_bitcast: ; GCN-ALLOCA-COUNT-4: buffer_store_dword @@ -24,20 +24,19 @@ target datalayout = "A5" ; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND3:v[0-9]+]], 3, [[IND2]], vcc ; GCN-PROMOTE: ScratchSize: 0 -define amdgpu_kernel void @vector_read_alloca_bitcast(i32 addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @vector_read_alloca_bitcast(ptr addrspace(1) %out, i32 %index) { entry: %tmp = alloca [4 x i32], addrspace(5) - %x = bitcast [4 x i32] addrspace(5)* %tmp to i32 addrspace(5)* - %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1 - %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2 - %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3 - store i32 0, i32 addrspace(5)* %x - store i32 1, i32 addrspace(5)* %y - store i32 2, i32 addrspace(5)* %z - store i32 3, i32 addrspace(5)* %w - %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index - %tmp2 = load i32, i32 addrspace(5)* %tmp1 - store i32 %tmp2, i32 addrspace(1)* %out + %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1 + %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2 + %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3 + store i32 0, ptr addrspace(5) %tmp + store i32 1, ptr addrspace(5) %y + store i32 2, ptr addrspace(5) %z + store i32 3, ptr addrspace(5) %w + %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index + %tmp2 = load i32, ptr addrspace(5) %tmp1 + store i32 %tmp2, ptr addrspace(1) %out ret void } @@ -45,7 +44,7 @@ entry: ; OPT-NOT: alloca ; OPT: %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index ; OPT-NEXT: %1 = extractelement <4 x i32> %0, i32 %r_index -; OPT-NEXT: store i32 %1, i32 addrspace(1)* %out, align +; OPT-NEXT: store i32 %1, ptr addrspace(1) %out, align ; GCN-LABEL: {{^}}vector_write_alloca_bitcast: ; GCN-ALLOCA-COUNT-5: buffer_store_dword @@ -55,22 +54,21 @@ entry: ; GCN-PROMOTE: ScratchSize: 0 -define amdgpu_kernel void @vector_write_alloca_bitcast(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { +define amdgpu_kernel void @vector_write_alloca_bitcast(ptr addrspace(1) %out, i32 %w_index, i32 %r_index) { entry: %tmp = alloca [4 x i32], addrspace(5) - %x = bitcast [4 x i32] addrspace(5)* %tmp to i32 addrspace(5)* - %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1 - %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2 - %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3 - store i32 0, i32 addrspace(5)* %x - store i32 0, i32 addrspace(5)* %y - store i32 0, i32 addrspace(5)* %z - store i32 0, i32 addrspace(5)* %w - %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %w_index - store i32 1, i32 addrspace(5)* %tmp1 - %tmp2 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %r_index - %tmp3 = load i32, i32 addrspace(5)* %tmp2 - store i32 %tmp3, i32 addrspace(1)* %out + %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1 + %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2 + %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3 + store i32 0, ptr addrspace(5) %tmp + store i32 0, ptr addrspace(5) %y + store i32 0, ptr addrspace(5) %z + store i32 0, ptr addrspace(5) %w + %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %w_index + store i32 1, ptr addrspace(5) %tmp1 + %tmp2 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %r_index + %tmp3 = load i32, ptr addrspace(5) %tmp2 + store i32 %tmp3, ptr addrspace(1) %out ret void } @@ -78,7 +76,7 @@ entry: ; OPT-NOT: alloca ; OPT: bb2: ; OPT: %tmp.sroa.0.0 = phi <6 x float> [ undef, %bb ], [ %0, %bb2 ] -; OPT: %0 = insertelement <6 x float> %tmp.sroa.0.0, float %tmp73, i32 %tmp10 +; OPT: %0 = insertelement <6 x float> %tmp.sroa.0.0, float %tmp72, i32 %tmp10 ; OPT: .preheader: ; OPT: %bc = bitcast <6 x float> %0 to <6 x i32> ; OPT: %1 = extractelement <6 x i32> %bc, i32 %tmp20 @@ -106,31 +104,28 @@ entry: ; GCN-PROMOTE: ScratchSize: 0 -define amdgpu_kernel void @vector_write_read_bitcast_to_float(float addrspace(1)* %arg) { +define amdgpu_kernel void @vector_write_read_bitcast_to_float(ptr addrspace(1) %arg) { bb: %tmp = alloca [6 x float], align 4, addrspace(5) - %tmp1 = bitcast [6 x float] addrspace(5)* %tmp to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 24, i8 addrspace(5)* %tmp1) #2 + call void @llvm.lifetime.start.p5(i64 24, ptr addrspace(5) %tmp) #2 br label %bb2 bb2: ; preds = %bb2, %bb %tmp3 = phi i32 [ 0, %bb ], [ %tmp13, %bb2 ] %tmp4 = zext i32 %tmp3 to i64 - %tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp4 - %tmp6 = bitcast float addrspace(1)* %tmp5 to i32 addrspace(1)* - %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp5 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %tmp4 + %tmp7 = load i32, ptr addrspace(1) %tmp5, align 4 %tmp8 = trunc i32 %tmp3 to i16 %tmp9 = urem i16 %tmp8, 6 %tmp10 = zext i16 %tmp9 to i32 - %tmp11 = getelementptr inbounds [6 x float], [6 x float] addrspace(5)* %tmp, i32 0, i32 %tmp10 - %tmp12 = bitcast float addrspace(5)* %tmp11 to i32 addrspace(5)* - store i32 %tmp7, i32 addrspace(5)* %tmp12, align 4 + %tmp11 = getelementptr inbounds [6 x float], ptr addrspace(5) %tmp, i32 0, i32 %tmp10 + store i32 %tmp7, ptr addrspace(5) %tmp11, align 4 %tmp13 = add nuw nsw i32 %tmp3, 1 %tmp14 = icmp eq i32 %tmp13, 1000 br i1 %tmp14, label %.preheader, label %bb2 bb15: ; preds = %.preheader - call void @llvm.lifetime.end.p5i8(i64 24, i8 addrspace(5)* %tmp1) #2 + call void @llvm.lifetime.end.p5(i64 24, ptr addrspace(5) %tmp) #2 ret void .preheader: ; preds = %.preheader, %bb2 @@ -139,13 +134,11 @@ bb15: ; preds = %.preheader %tmp18 = urem i16 %tmp17, 6 %tmp19 = sub nuw nsw i16 5, %tmp18 %tmp20 = zext i16 %tmp19 to i32 - %tmp21 = getelementptr inbounds [6 x float], [6 x float] addrspace(5)* %tmp, i32 0, i32 %tmp20 - %tmp22 = bitcast float addrspace(5)* %tmp21 to i32 addrspace(5)* - %tmp23 = load i32, i32 addrspace(5)* %tmp22, align 4 + %tmp21 = getelementptr inbounds [6 x float], ptr addrspace(5) %tmp, i32 0, i32 %tmp20 + %tmp23 = load i32, ptr addrspace(5) %tmp21, align 4 %tmp24 = zext i32 %tmp16 to i64 - %tmp25 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp24 - %tmp26 = bitcast float addrspace(1)* %tmp25 to i32 addrspace(1)* - store i32 %tmp23, i32 addrspace(1)* %tmp26, align 4 + %tmp25 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %tmp24 + store i32 %tmp23, ptr addrspace(1) %tmp25, align 4 %tmp27 = add nuw nsw i32 %tmp16, 1 %tmp28 = icmp eq i32 %tmp27, 1000 br i1 %tmp28, label %bb15, label %.preheader @@ -155,7 +148,7 @@ bb15: ; preds = %.preheader ; OPT-NOT: alloca ; OPT: bb2: ; OPT: %tmp.sroa.0.0 = phi <6 x double> [ undef, %bb ], [ %0, %bb2 ] -; OPT: %0 = insertelement <6 x double> %tmp.sroa.0.0, double %tmp73, i32 %tmp10 +; OPT: %0 = insertelement <6 x double> %tmp.sroa.0.0, double %tmp72, i32 %tmp10 ; OPT: .preheader: ; OPT: %bc = bitcast <6 x double> %0 to <6 x i64> ; OPT: %1 = extractelement <6 x i64> %bc, i32 %tmp20 @@ -172,31 +165,28 @@ bb15: ; preds = %.preheader ; GCN-PROMOTE: ScratchSize: 0 -define amdgpu_kernel void @vector_write_read_bitcast_to_double(double addrspace(1)* %arg) { +define amdgpu_kernel void @vector_write_read_bitcast_to_double(ptr addrspace(1) %arg) { bb: %tmp = alloca [6 x double], align 8, addrspace(5) - %tmp1 = bitcast [6 x double] addrspace(5)* %tmp to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2 + call void @llvm.lifetime.start.p5(i64 48, ptr addrspace(5) %tmp) #2 br label %bb2 bb2: ; preds = %bb2, %bb %tmp3 = phi i32 [ 0, %bb ], [ %tmp13, %bb2 ] %tmp4 = zext i32 %tmp3 to i64 - %tmp5 = getelementptr inbounds double, double addrspace(1)* %arg, i64 %tmp4 - %tmp6 = bitcast double addrspace(1)* %tmp5 to i64 addrspace(1)* - %tmp7 = load i64, i64 addrspace(1)* %tmp6, align 8 + %tmp5 = getelementptr inbounds double, ptr addrspace(1) %arg, i64 %tmp4 + %tmp7 = load i64, ptr addrspace(1) %tmp5, align 8 %tmp8 = trunc i32 %tmp3 to i16 %tmp9 = urem i16 %tmp8, 6 %tmp10 = zext i16 %tmp9 to i32 - %tmp11 = getelementptr inbounds [6 x double], [6 x double] addrspace(5)* %tmp, i32 0, i32 %tmp10 - %tmp12 = bitcast double addrspace(5)* %tmp11 to i64 addrspace(5)* - store i64 %tmp7, i64 addrspace(5)* %tmp12, align 8 + %tmp11 = getelementptr inbounds [6 x double], ptr addrspace(5) %tmp, i32 0, i32 %tmp10 + store i64 %tmp7, ptr addrspace(5) %tmp11, align 8 %tmp13 = add nuw nsw i32 %tmp3, 1 %tmp14 = icmp eq i32 %tmp13, 1000 br i1 %tmp14, label %.preheader, label %bb2 bb15: ; preds = %.preheader - call void @llvm.lifetime.end.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2 + call void @llvm.lifetime.end.p5(i64 48, ptr addrspace(5) %tmp) #2 ret void .preheader: ; preds = %.preheader, %bb2 @@ -205,13 +195,11 @@ bb15: ; preds = %.preheader %tmp18 = urem i16 %tmp17, 6 %tmp19 = sub nuw nsw i16 5, %tmp18 %tmp20 = zext i16 %tmp19 to i32 - %tmp21 = getelementptr inbounds [6 x double], [6 x double] addrspace(5)* %tmp, i32 0, i32 %tmp20 - %tmp22 = bitcast double addrspace(5)* %tmp21 to i64 addrspace(5)* - %tmp23 = load i64, i64 addrspace(5)* %tmp22, align 8 + %tmp21 = getelementptr inbounds [6 x double], ptr addrspace(5) %tmp, i32 0, i32 %tmp20 + %tmp23 = load i64, ptr addrspace(5) %tmp21, align 8 %tmp24 = zext i32 %tmp16 to i64 - %tmp25 = getelementptr inbounds double, double addrspace(1)* %arg, i64 %tmp24 - %tmp26 = bitcast double addrspace(1)* %tmp25 to i64 addrspace(1)* - store i64 %tmp23, i64 addrspace(1)* %tmp26, align 8 + %tmp25 = getelementptr inbounds double, ptr addrspace(1) %arg, i64 %tmp24 + store i64 %tmp23, ptr addrspace(1) %tmp25, align 8 %tmp27 = add nuw nsw i32 %tmp16, 1 %tmp28 = icmp eq i32 %tmp27, 1000 br i1 %tmp28, label %bb15, label %.preheader @@ -237,29 +225,28 @@ bb15: ; preds = %.preheader ; GCN-PROMOTE: ScratchSize: 0 -define amdgpu_kernel void @vector_write_read_bitcast_to_i64(i64 addrspace(1)* %arg) { +define amdgpu_kernel void @vector_write_read_bitcast_to_i64(ptr addrspace(1) %arg) { bb: %tmp = alloca [6 x i64], align 8, addrspace(5) - %tmp1 = bitcast [6 x i64] addrspace(5)* %tmp to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2 + call void @llvm.lifetime.start.p5(i64 48, ptr addrspace(5) %tmp) #2 br label %bb2 bb2: ; preds = %bb2, %bb %tmp3 = phi i32 [ 0, %bb ], [ %tmp11, %bb2 ] %tmp4 = zext i32 %tmp3 to i64 - %tmp5 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp4 - %tmp6 = load i64, i64 addrspace(1)* %tmp5, align 8 + %tmp5 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 %tmp4 + %tmp6 = load i64, ptr addrspace(1) %tmp5, align 8 %tmp7 = trunc i32 %tmp3 to i16 %tmp8 = urem i16 %tmp7, 6 %tmp9 = zext i16 %tmp8 to i32 - %tmp10 = getelementptr inbounds [6 x i64], [6 x i64] addrspace(5)* %tmp, i32 0, i32 %tmp9 - store i64 %tmp6, i64 addrspace(5)* %tmp10, align 8 + %tmp10 = getelementptr inbounds [6 x i64], ptr addrspace(5) %tmp, i32 0, i32 %tmp9 + store i64 %tmp6, ptr addrspace(5) %tmp10, align 8 %tmp11 = add nuw nsw i32 %tmp3, 1 %tmp12 = icmp eq i32 %tmp11, 1000 br i1 %tmp12, label %.preheader, label %bb2 bb13: ; preds = %.preheader - call void @llvm.lifetime.end.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2 + call void @llvm.lifetime.end.p5(i64 48, ptr addrspace(5) %tmp) #2 ret void .preheader: ; preds = %.preheader, %bb2 @@ -268,11 +255,11 @@ bb13: ; preds = %.preheader %tmp16 = urem i16 %tmp15, 6 %tmp17 = sub nuw nsw i16 5, %tmp16 %tmp18 = zext i16 %tmp17 to i32 - %tmp19 = getelementptr inbounds [6 x i64], [6 x i64] addrspace(5)* %tmp, i32 0, i32 %tmp18 - %tmp20 = load i64, i64 addrspace(5)* %tmp19, align 8 + %tmp19 = getelementptr inbounds [6 x i64], ptr addrspace(5) %tmp, i32 0, i32 %tmp18 + %tmp20 = load i64, ptr addrspace(5) %tmp19, align 8 %tmp21 = zext i32 %tmp14 to i64 - %tmp22 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp21 - store i64 %tmp20, i64 addrspace(1)* %tmp22, align 8 + %tmp22 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 %tmp21 + store i64 %tmp20, ptr addrspace(1) %tmp22, align 8 %tmp23 = add nuw nsw i32 %tmp14, 1 %tmp24 = icmp eq i32 %tmp23, 1000 br i1 %tmp24, label %bb13, label %.preheader @@ -282,27 +269,26 @@ bb13: ; preds = %.preheader ; OPT-LABEL: @vector_read_alloca_bitcast_assume( ; OPT: %0 = extractelement <4 x i32> , i32 %index -; OPT: store i32 %0, i32 addrspace(1)* %out, align 4 +; OPT: store i32 %0, ptr addrspace(1) %out, align 4 ; GCN-LABEL: {{^}}vector_read_alloca_bitcast_assume: ; GCN-COUNT-4: buffer_store_dword -define amdgpu_kernel void @vector_read_alloca_bitcast_assume(i32 addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @vector_read_alloca_bitcast_assume(ptr addrspace(1) %out, i32 %index) { entry: %tmp = alloca [4 x i32], addrspace(5) - %x = bitcast [4 x i32] addrspace(5)* %tmp to i32 addrspace(5)* - %cmp = icmp ne i32 addrspace(5)* %x, null + %cmp = icmp ne ptr addrspace(5) %tmp, null call void @llvm.assume(i1 %cmp) - %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1 - %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2 - %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3 - store i32 0, i32 addrspace(5)* %x - store i32 1, i32 addrspace(5)* %y - store i32 2, i32 addrspace(5)* %z - store i32 3, i32 addrspace(5)* %w - %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index - %tmp2 = load i32, i32 addrspace(5)* %tmp1 - store i32 %tmp2, i32 addrspace(1)* %out + %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1 + %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2 + %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3 + store i32 0, ptr addrspace(5) %tmp + store i32 1, ptr addrspace(5) %y + store i32 2, ptr addrspace(5) %z + store i32 3, ptr addrspace(5) %w + %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index + %tmp2 = load i32, ptr addrspace(5) %tmp1 + store i32 %tmp2, ptr addrspace(1) %out ret void } @@ -310,7 +296,7 @@ entry: ; OPT-NOT: alloca ; OPT: %0 = extractelement <4 x i32> , i32 %index ; OPT-NEXT: %add2 = add nuw nsw i32 %0, 1 -; OPT-NEXT: store i32 %add2, i32 addrspace(1)* %out, align 4 +; OPT-NEXT: store i32 %add2, ptr addrspace(1) %out, align 4 ; GCN-LABEL: {{^}}vector_read_alloca_multiuse: ; GCN-ALLOCA-COUNT-4: buffer_store_dword @@ -328,31 +314,29 @@ entry: ; GCN-PROMOTE: ScratchSize: 0 -define amdgpu_kernel void @vector_read_alloca_multiuse(i32 addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @vector_read_alloca_multiuse(ptr addrspace(1) %out, i32 %index) { entry: %tmp = alloca [4 x i32], addrspace(5) - %b = bitcast [4 x i32] addrspace(5)* %tmp to float addrspace(5)* - %x = bitcast float addrspace(5)* %b to i32 addrspace(5)* - %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1 - %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2 - %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3 - store i32 0, i32 addrspace(5)* %x - store i32 1, i32 addrspace(5)* %y - store i32 2, i32 addrspace(5)* %z - store i32 3, i32 addrspace(5)* %w - %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index - %tmp2 = load i32, i32 addrspace(5)* %tmp1 - %tmp3 = load i32, i32 addrspace(5)* %x - %tmp4 = load i32, i32 addrspace(5)* %y + %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1 + %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2 + %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3 + store i32 0, ptr addrspace(5) %tmp + store i32 1, ptr addrspace(5) %y + store i32 2, ptr addrspace(5) %z + store i32 3, ptr addrspace(5) %w + %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index + %tmp2 = load i32, ptr addrspace(5) %tmp1 + %tmp3 = load i32, ptr addrspace(5) %tmp + %tmp4 = load i32, ptr addrspace(5) %y %add1 = add i32 %tmp2, %tmp3 %add2 = add i32 %add1, %tmp4 - store i32 %add2, i32 addrspace(1)* %out + store i32 %add2, ptr addrspace(1) %out ret void } ; OPT-LABEL: @bitcast_vector_to_vector( ; OPT-NOT: alloca -; OPT: store <4 x i32> , <4 x i32> addrspace(1)* %out, align 16 +; OPT: store <4 x i32> , ptr addrspace(1) %out, align 16 ; GCN-LABEL: {{^}}bitcast_vector_to_vector: ; GCN: v_mov_b32_e32 v0, 1 @@ -362,19 +346,18 @@ entry: ; GCN: ScratchSize: 0 -define amdgpu_kernel void @bitcast_vector_to_vector(<4 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @bitcast_vector_to_vector(ptr addrspace(1) %out) { .entry: %alloca = alloca <4 x float>, align 16, addrspace(5) - %cast = bitcast <4 x float> addrspace(5)* %alloca to <4 x i32> addrspace(5)* - store <4 x i32> , <4 x i32> addrspace(5)* %cast - %load = load <4 x i32>, <4 x i32> addrspace(5)* %cast, align 16 - store <4 x i32> %load, <4 x i32> addrspace(1)* %out + store <4 x i32> , ptr addrspace(5) %alloca + %load = load <4 x i32>, ptr addrspace(5) %alloca, align 16 + store <4 x i32> %load, ptr addrspace(1) %out ret void } ; OPT-LABEL: @vector_bitcast_from_alloca_array( ; OPT-NOT: alloca -; OPT: store <4 x i32> , <4 x i32> addrspace(1)* %out, align 16 +; OPT: store <4 x i32> , ptr addrspace(1) %out, align 16 ; GCN-LABEL: {{^}}vector_bitcast_from_alloca_array: ; GCN: v_mov_b32_e32 v0, 1 @@ -384,26 +367,24 @@ define amdgpu_kernel void @bitcast_vector_to_vector(<4 x i32> addrspace(1)* %out ; GCN: ScratchSize: 0 -define amdgpu_kernel void @vector_bitcast_from_alloca_array(<4 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @vector_bitcast_from_alloca_array(ptr addrspace(1) %out) { .entry: %alloca = alloca [4 x float], align 16, addrspace(5) - %cast = bitcast [4 x float] addrspace(5)* %alloca to <4 x i32> addrspace(5)* - store <4 x i32> , <4 x i32> addrspace(5)* %cast - %load = load <4 x i32>, <4 x i32> addrspace(5)* %cast, align 16 - store <4 x i32> %load, <4 x i32> addrspace(1)* %out + store <4 x i32> , ptr addrspace(5) %alloca + %load = load <4 x i32>, ptr addrspace(5) %alloca, align 16 + store <4 x i32> %load, ptr addrspace(1) %out ret void } ; OPT-LABEL: @vector_bitcast_to_array_from_alloca_array( ; OPT-NOT: alloca -; OPT: %out.repack = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 0 -; OPT-NEXT: store i32 1, i32 addrspace(1)* %out.repack, align 4 -; OPT-NEXT: %out.repack1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 1 -; OPT-NEXT: store i32 2, i32 addrspace(1)* %out.repack1, align 4 -; OPT-NEXT: %out.repack2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 2 -; OPT-NEXT: store i32 3, i32 addrspace(1)* %out.repack2, align 4 -; OPT-NEXT: %out.repack3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 3 -; OPT-NEXT: store i32 4, i32 addrspace(1)* %out.repack3, align 4 +; OPT-NEXT: store i32 1, ptr addrspace(1) %out, align 4 +; OPT-NEXT: %out.repack1 = getelementptr inbounds [4 x i32], ptr addrspace(1) %out, i64 0, i64 1 +; OPT-NEXT: store i32 2, ptr addrspace(1) %out.repack1, align 4 +; OPT-NEXT: %out.repack2 = getelementptr inbounds [4 x i32], ptr addrspace(1) %out, i64 0, i64 2 +; OPT-NEXT: store i32 3, ptr addrspace(1) %out.repack2, align 4 +; OPT-NEXT: %out.repack3 = getelementptr inbounds [4 x i32], ptr addrspace(1) %out, i64 0, i64 3 +; OPT-NEXT: store i32 4, ptr addrspace(1) %out.repack3, align 4 ; GCN-LABEL: {{^}}vector_bitcast_to_array_from_alloca_array: ; GCN: v_mov_b32_e32 v0, 1 @@ -413,26 +394,23 @@ define amdgpu_kernel void @vector_bitcast_from_alloca_array(<4 x i32> addrspace( ; GCN: ScratchSize: 0 -define amdgpu_kernel void @vector_bitcast_to_array_from_alloca_array([4 x i32] addrspace(1)* %out) { -.entry: +define amdgpu_kernel void @vector_bitcast_to_array_from_alloca_array(ptr addrspace(1) %out) { %alloca = alloca [4 x float], align 16, addrspace(5) - %cast = bitcast [4 x float] addrspace(5)* %alloca to [4 x i32] addrspace(5)* - store [4 x i32] [i32 1, i32 2, i32 3, i32 4], [4 x i32] addrspace(5)* %cast - %load = load [4 x i32], [4 x i32] addrspace(5)* %cast, align 16 - store [4 x i32] %load, [4 x i32] addrspace(1)* %out + store [4 x i32] [i32 1, i32 2, i32 3, i32 4], ptr addrspace(5) %alloca + %load = load [4 x i32], ptr addrspace(5) %alloca, align 16 + store [4 x i32] %load, ptr addrspace(1) %out ret void } ; OPT-LABEL: @vector_bitcast_to_struct_from_alloca_array( ; OPT-NOT: alloca -; OPT: %out.repack = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 0 -; OPT-NEXT: store i32 1, i32 addrspace(1)* %out.repack, align 4 -; OPT-NEXT: %out.repack1 = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 1 -; OPT-NEXT: store i32 2, i32 addrspace(1)* %out.repack1, align 4 -; OPT-NEXT: %out.repack2 = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 2 -; OPT-NEXT: store i32 3, i32 addrspace(1)* %out.repack2, align 4 -; OPT-NEXT: %out.repack3 = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 3 -; OPT-NEXT: store i32 4, i32 addrspace(1)* %out.repack3, align 4 +; OPT-NEXT: store i32 1, ptr addrspace(1) %out, align 4 +; OPT-NEXT: %out.repack1 = getelementptr inbounds %struct.v4, ptr addrspace(1) %out, i64 0, i32 1 +; OPT-NEXT: store i32 2, ptr addrspace(1) %out.repack1, align 4 +; OPT-NEXT: %out.repack2 = getelementptr inbounds %struct.v4, ptr addrspace(1) %out, i64 0, i32 2 +; OPT-NEXT: store i32 3, ptr addrspace(1) %out.repack2, align 4 +; OPT-NEXT: %out.repack3 = getelementptr inbounds %struct.v4, ptr addrspace(1) %out, i64 0, i32 3 +; OPT-NEXT: store i32 4, ptr addrspace(1) %out.repack3, align 4 ; GCN-LABEL: {{^}}vector_bitcast_to_struct_from_alloca_array: ; GCN: v_mov_b32_e32 v0, 1 @@ -444,18 +422,16 @@ define amdgpu_kernel void @vector_bitcast_to_array_from_alloca_array([4 x i32] a %struct.v4 = type { i32, i32, i32, i32 } -define amdgpu_kernel void @vector_bitcast_to_struct_from_alloca_array(%struct.v4 addrspace(1)* %out) { -.entry: +define amdgpu_kernel void @vector_bitcast_to_struct_from_alloca_array(ptr addrspace(1) %out) { %alloca = alloca [4 x float], align 16, addrspace(5) - %cast = bitcast [4 x float] addrspace(5)* %alloca to %struct.v4 addrspace(5)* - store %struct.v4 { i32 1, i32 2, i32 3, i32 4 }, %struct.v4 addrspace(5)* %cast - %load = load %struct.v4, %struct.v4 addrspace(5)* %cast, align 16 - store %struct.v4 %load, %struct.v4 addrspace(1)* %out + store %struct.v4 { i32 1, i32 2, i32 3, i32 4 }, ptr addrspace(5) %alloca + %load = load %struct.v4, ptr addrspace(5) %alloca, align 16 + store %struct.v4 %load, ptr addrspace(1) %out ret void } -declare void @llvm.lifetime.start.p5i8(i64 immarg, i8 addrspace(5)* nocapture) +declare void @llvm.lifetime.start.p5(i64 immarg, ptr addrspace(5) nocapture) -declare void @llvm.lifetime.end.p5i8(i64 immarg, i8 addrspace(5)* nocapture) +declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) nocapture) declare void @llvm.assume(i1) diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll index 5a0c3666795c..1060ff777c31 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-limits.ll @@ -8,14 +8,13 @@ target datalayout = "A5" ; OPT: <8 x i64> ; LIMIT32: alloca ; LIMIT32-NOT: <8 x i64> -define amdgpu_kernel void @alloca_8xi64_max1024(i64 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @alloca_8xi64_max1024(ptr addrspace(1) %out, i32 %index) #0 { entry: %tmp = alloca [8 x i64], addrspace(5) - %x = getelementptr [8 x i64], [8 x i64] addrspace(5)* %tmp, i32 0, i32 0 - store i64 0, i64 addrspace(5)* %x - %tmp1 = getelementptr [8 x i64], [8 x i64] addrspace(5)* %tmp, i32 0, i32 %index - %tmp2 = load i64, i64 addrspace(5)* %tmp1 - store i64 %tmp2, i64 addrspace(1)* %out + store i64 0, ptr addrspace(5) %tmp + %tmp1 = getelementptr [8 x i64], ptr addrspace(5) %tmp, i32 0, i32 %index + %tmp2 = load i64, ptr addrspace(5) %tmp1 + store i64 %tmp2, ptr addrspace(1) %out ret void } @@ -24,14 +23,13 @@ entry: ; OPT-NOT: <9 x i64> ; LIMIT32: alloca ; LIMIT32-NOT: <9 x i64> -define amdgpu_kernel void @alloca_9xi64_max1024(i64 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @alloca_9xi64_max1024(ptr addrspace(1) %out, i32 %index) #0 { entry: %tmp = alloca [9 x i64], addrspace(5) - %x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0 - store i64 0, i64 addrspace(5)* %x - %tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index - %tmp2 = load i64, i64 addrspace(5)* %tmp1 - store i64 %tmp2, i64 addrspace(1)* %out + store i64 0, ptr addrspace(5) %tmp + %tmp1 = getelementptr [9 x i64], ptr addrspace(5) %tmp, i32 0, i32 %index + %tmp2 = load i64, ptr addrspace(5) %tmp1 + store i64 %tmp2, ptr addrspace(1) %out ret void } @@ -40,14 +38,13 @@ entry: ; OPT: <16 x i64> ; LIMIT32: alloca ; LIMIT32-NOT: <16 x i64> -define amdgpu_kernel void @alloca_16xi64_max512(i64 addrspace(1)* %out, i32 %index) #1 { +define amdgpu_kernel void @alloca_16xi64_max512(ptr addrspace(1) %out, i32 %index) #1 { entry: %tmp = alloca [16 x i64], addrspace(5) - %x = getelementptr [16 x i64], [16 x i64] addrspace(5)* %tmp, i32 0, i32 0 - store i64 0, i64 addrspace(5)* %x - %tmp1 = getelementptr [16 x i64], [16 x i64] addrspace(5)* %tmp, i32 0, i32 %index - %tmp2 = load i64, i64 addrspace(5)* %tmp1 - store i64 %tmp2, i64 addrspace(1)* %out + store i64 0, ptr addrspace(5) %tmp + %tmp1 = getelementptr [16 x i64], ptr addrspace(5) %tmp, i32 0, i32 %index + %tmp2 = load i64, ptr addrspace(5) %tmp1 + store i64 %tmp2, ptr addrspace(1) %out ret void } @@ -56,14 +53,13 @@ entry: ; OPT-NOT: <17 x i64> ; LIMIT32: alloca ; LIMIT32-NOT: <17 x i64> -define amdgpu_kernel void @alloca_17xi64_max512(i64 addrspace(1)* %out, i32 %index) #1 { +define amdgpu_kernel void @alloca_17xi64_max512(ptr addrspace(1) %out, i32 %index) #1 { entry: %tmp = alloca [17 x i64], addrspace(5) - %x = getelementptr [17 x i64], [17 x i64] addrspace(5)* %tmp, i32 0, i32 0 - store i64 0, i64 addrspace(5)* %x - %tmp1 = getelementptr [17 x i64], [17 x i64] addrspace(5)* %tmp, i32 0, i32 %index - %tmp2 = load i64, i64 addrspace(5)* %tmp1 - store i64 %tmp2, i64 addrspace(1)* %out + store i64 0, ptr addrspace(5) %tmp + %tmp1 = getelementptr [17 x i64], ptr addrspace(5) %tmp, i32 0, i32 %index + %tmp2 = load i64, ptr addrspace(5) %tmp1 + store i64 %tmp2, ptr addrspace(1) %out ret void } @@ -72,14 +68,13 @@ entry: ; OPT-NOT: <9 x i128> ; LIMIT32: alloca ; LIMIT32-NOT: <9 x i128> -define amdgpu_kernel void @alloca_9xi128_max512(i128 addrspace(1)* %out, i32 %index) #1 { +define amdgpu_kernel void @alloca_9xi128_max512(ptr addrspace(1) %out, i32 %index) #1 { entry: %tmp = alloca [9 x i128], addrspace(5) - %x = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 0 - store i128 0, i128 addrspace(5)* %x - %tmp1 = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 %index - %tmp2 = load i128, i128 addrspace(5)* %tmp1 - store i128 %tmp2, i128 addrspace(1)* %out + store i128 0, ptr addrspace(5) %tmp + %tmp1 = getelementptr [9 x i128], ptr addrspace(5) %tmp, i32 0, i32 %index + %tmp2 = load i128, ptr addrspace(5) %tmp1 + store i128 %tmp2, ptr addrspace(1) %out ret void } @@ -88,14 +83,13 @@ entry: ; OPT: <9 x i128> ; LIMIT32: alloca ; LIMIT32-NOT: <9 x i128> -define amdgpu_kernel void @alloca_9xi128_max256(i128 addrspace(1)* %out, i32 %index) #2 { +define amdgpu_kernel void @alloca_9xi128_max256(ptr addrspace(1) %out, i32 %index) #2 { entry: %tmp = alloca [9 x i128], addrspace(5) - %x = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 0 - store i128 0, i128 addrspace(5)* %x - %tmp1 = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 %index - %tmp2 = load i128, i128 addrspace(5)* %tmp1 - store i128 %tmp2, i128 addrspace(1)* %out + store i128 0, ptr addrspace(5) %tmp + %tmp1 = getelementptr [9 x i128], ptr addrspace(5) %tmp, i32 0, i32 %index + %tmp2 = load i128, ptr addrspace(5) %tmp1 + store i128 %tmp2, ptr addrspace(1) %out ret void } @@ -104,14 +98,13 @@ entry: ; OPT: <16 x i128> ; LIMIT32: alloca ; LIMIT32-NOT: <16 x i128> -define amdgpu_kernel void @alloca_16xi128_max256(i128 addrspace(1)* %out, i32 %index) #2 { +define amdgpu_kernel void @alloca_16xi128_max256(ptr addrspace(1) %out, i32 %index) #2 { entry: %tmp = alloca [16 x i128], addrspace(5) - %x = getelementptr [16 x i128], [16 x i128] addrspace(5)* %tmp, i32 0, i32 0 - store i128 0, i128 addrspace(5)* %x - %tmp1 = getelementptr [16 x i128], [16 x i128] addrspace(5)* %tmp, i32 0, i32 %index - %tmp2 = load i128, i128 addrspace(5)* %tmp1 - store i128 %tmp2, i128 addrspace(1)* %out + store i128 0, ptr addrspace(5) %tmp + %tmp1 = getelementptr [16 x i128], ptr addrspace(5) %tmp, i32 0, i32 %index + %tmp2 = load i128, ptr addrspace(5) %tmp1 + store i128 %tmp2, ptr addrspace(1) %out ret void } @@ -120,14 +113,13 @@ entry: ; OPT-NOT: <9 x i256> ; LIMIT32: alloca ; LIMIT32-NOT: <9 x i256> -define amdgpu_kernel void @alloca_9xi256_max256(i256 addrspace(1)* %out, i32 %index) #2 { +define amdgpu_kernel void @alloca_9xi256_max256(ptr addrspace(1) %out, i32 %index) #2 { entry: %tmp = alloca [9 x i256], addrspace(5) - %x = getelementptr [9 x i256], [9 x i256] addrspace(5)* %tmp, i32 0, i32 0 - store i256 0, i256 addrspace(5)* %x - %tmp1 = getelementptr [9 x i256], [9 x i256] addrspace(5)* %tmp, i32 0, i32 %index - %tmp2 = load i256, i256 addrspace(5)* %tmp1 - store i256 %tmp2, i256 addrspace(1)* %out + store i256 0, ptr addrspace(5) %tmp + %tmp1 = getelementptr [9 x i256], ptr addrspace(5) %tmp, i32 0, i32 %index + %tmp2 = load i256, ptr addrspace(5) %tmp1 + store i256 %tmp2, ptr addrspace(1) %out ret void } @@ -136,14 +128,13 @@ entry: ; OPT: <9 x i64> ; LIMIT32: alloca ; LIMIT32-NOT: <9 x i64> -define amdgpu_kernel void @alloca_9xi64_max256(i64 addrspace(1)* %out, i32 %index) #2 { +define amdgpu_kernel void @alloca_9xi64_max256(ptr addrspace(1) %out, i32 %index) #2 { entry: %tmp = alloca [9 x i64], addrspace(5) - %x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0 - store i64 0, i64 addrspace(5)* %x - %tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index - %tmp2 = load i64, i64 addrspace(5)* %tmp1 - store i64 %tmp2, i64 addrspace(1)* %out + store i64 0, ptr addrspace(5) %tmp + %tmp1 = getelementptr [9 x i64], ptr addrspace(5) %tmp, i32 0, i32 %index + %tmp2 = load i64, ptr addrspace(5) %tmp1 + store i64 %tmp2, ptr addrspace(1) %out ret void } @@ -152,14 +143,13 @@ entry: ; OPT-NOT: <9 x i64> ; LIMIT32: alloca ; LIMIT32-NOT: <9 x i64> -define void @func_alloca_9xi64_max256(i64 addrspace(1)* %out, i32 %index) #2 { +define void @func_alloca_9xi64_max256(ptr addrspace(1) %out, i32 %index) #2 { entry: %tmp = alloca [9 x i64], addrspace(5) - %x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0 - store i64 0, i64 addrspace(5)* %x - %tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index - %tmp2 = load i64, i64 addrspace(5)* %tmp1 - store i64 %tmp2, i64 addrspace(1)* %out + store i64 0, ptr addrspace(5) %tmp + %tmp1 = getelementptr [9 x i64], ptr addrspace(5) %tmp, i32 0, i32 %index + %tmp2 = load i64, ptr addrspace(5) %tmp1 + store i64 %tmp2, ptr addrspace(1) %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca.ll index ca81759f80f9..20e544dc4b5a 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-alloca.ll @@ -9,7 +9,7 @@ target datalayout = "A5" ; OPT-LABEL: @vector_read( ; OPT: %0 = extractelement <4 x i32> , i32 %index -; OPT: store i32 %0, i32 addrspace(1)* %out, align 4 +; OPT: store i32 %0, ptr addrspace(1) %out, align 4 ; FUNC-LABEL: {{^}}vector_read: ; EG: MOV @@ -17,27 +17,26 @@ target datalayout = "A5" ; EG: MOV ; EG: MOV ; EG: MOVA_INT -define amdgpu_kernel void @vector_read(i32 addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @vector_read(ptr addrspace(1) %out, i32 %index) { entry: %tmp = alloca [4 x i32], addrspace(5) - %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0 - %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1 - %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2 - %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3 - store i32 0, i32 addrspace(5)* %x - store i32 1, i32 addrspace(5)* %y - store i32 2, i32 addrspace(5)* %z - store i32 3, i32 addrspace(5)* %w - %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index - %tmp2 = load i32, i32 addrspace(5)* %tmp1 - store i32 %tmp2, i32 addrspace(1)* %out + %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1 + %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2 + %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3 + store i32 0, ptr addrspace(5) %tmp + store i32 1, ptr addrspace(5) %y + store i32 2, ptr addrspace(5) %z + store i32 3, ptr addrspace(5) %w + %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index + %tmp2 = load i32, ptr addrspace(5) %tmp1 + store i32 %tmp2, ptr addrspace(1) %out ret void } ; OPT-LABEL: @vector_write( ; OPT: %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index ; OPT: %1 = extractelement <4 x i32> %0, i32 %r_index -; OPT: store i32 %1, i32 addrspace(1)* %out, align 4 +; OPT: store i32 %1, ptr addrspace(1) %out, align 4 ; FUNC-LABEL: {{^}}vector_write: ; EG: MOV @@ -46,91 +45,83 @@ entry: ; EG: MOV ; EG: MOVA_INT ; EG: MOVA_INT -define amdgpu_kernel void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { +define amdgpu_kernel void @vector_write(ptr addrspace(1) %out, i32 %w_index, i32 %r_index) { entry: %tmp = alloca [4 x i32], addrspace(5) - %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0 - %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1 - %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2 - %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3 - store i32 0, i32 addrspace(5)* %x - store i32 0, i32 addrspace(5)* %y - store i32 0, i32 addrspace(5)* %z - store i32 0, i32 addrspace(5)* %w - %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %w_index - store i32 1, i32 addrspace(5)* %tmp1 - %tmp2 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %r_index - %tmp3 = load i32, i32 addrspace(5)* %tmp2 - store i32 %tmp3, i32 addrspace(1)* %out + %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1 + %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2 + %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3 + store i32 0, ptr addrspace(5) %tmp + store i32 0, ptr addrspace(5) %y + store i32 0, ptr addrspace(5) %z + store i32 0, ptr addrspace(5) %w + %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %w_index + store i32 1, ptr addrspace(5) %tmp1 + %tmp2 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %r_index + %tmp3 = load i32, ptr addrspace(5) %tmp2 + store i32 %tmp3, ptr addrspace(1) %out ret void } ; This test should be optimize to: -; store i32 0, i32 addrspace(1)* %out +; store i32 0, ptr addrspace(1) %out ; OPT-LABEL: @bitcast_gep( -; OPT-LABEL: store i32 0, i32 addrspace(1)* %out, align 4 +; OPT-LABEL: store i32 0, ptr addrspace(1) %out, align 4 ; FUNC-LABEL: {{^}}bitcast_gep: ; EG: STORE_RAW -define amdgpu_kernel void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { +define amdgpu_kernel void @bitcast_gep(ptr addrspace(1) %out, i32 %w_index, i32 %r_index) { entry: %tmp = alloca [4 x i32], addrspace(5) - %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0 - %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1 - %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2 - %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3 - store i32 0, i32 addrspace(5)* %x - store i32 0, i32 addrspace(5)* %y - store i32 0, i32 addrspace(5)* %z - store i32 0, i32 addrspace(5)* %w - %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1 - %tmp2 = bitcast i32 addrspace(5)* %tmp1 to [4 x i32] addrspace(5)* - %tmp3 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp2, i32 0, i32 0 - %tmp4 = load i32, i32 addrspace(5)* %tmp3 - store i32 %tmp4, i32 addrspace(1)* %out + %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1 + %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2 + %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3 + store i32 0, ptr addrspace(5) %tmp + store i32 0, ptr addrspace(5) %y + store i32 0, ptr addrspace(5) %z + store i32 0, ptr addrspace(5) %w + %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1 + %tmp4 = load i32, ptr addrspace(5) %tmp1 + store i32 %tmp4, ptr addrspace(1) %out ret void } ; OPT-LABEL: @vector_read_bitcast_gep( ; OPT: %0 = extractelement <4 x i32> , i32 %index -; OPT: store i32 %0, i32 addrspace(1)* %out, align 4 -define amdgpu_kernel void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) { +; OPT: store i32 %0, ptr addrspace(1) %out, align 4 +define amdgpu_kernel void @vector_read_bitcast_gep(ptr addrspace(1) %out, i32 %index) { entry: %tmp = alloca [4 x i32], addrspace(5) - %x = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0 - %y = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1 - %z = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2 - %w = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3 - %bc = bitcast i32 addrspace(5)* %x to float addrspace(5)* - store float 1.0, float addrspace(5)* %bc - store i32 1, i32 addrspace(5)* %y - store i32 2, i32 addrspace(5)* %z - store i32 3, i32 addrspace(5)* %w - %tmp1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index - %tmp2 = load i32, i32 addrspace(5)* %tmp1 - store i32 %tmp2, i32 addrspace(1)* %out + %y = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1 + %z = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2 + %w = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3 + store float 1.0, ptr addrspace(5) %tmp + store i32 1, ptr addrspace(5) %y + store i32 2, ptr addrspace(5) %z + store i32 3, ptr addrspace(5) %w + %tmp1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index + %tmp2 = load i32, ptr addrspace(5) %tmp1 + store i32 %tmp2, ptr addrspace(1) %out ret void } ; OPT-LABEL: @vector_read_bitcast_alloca( ; OPT: %0 = extractelement <4 x float> , i32 %index -; OPT: store float %0, float addrspace(1)* %out, align 4 -define amdgpu_kernel void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) { +; OPT: store float %0, ptr addrspace(1) %out, align 4 +define amdgpu_kernel void @vector_read_bitcast_alloca(ptr addrspace(1) %out, i32 %index) { entry: %tmp = alloca [4 x i32], addrspace(5) - %tmp.bc = bitcast [4 x i32] addrspace(5)* %tmp to [4 x float] addrspace(5)* - %x = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 0 - %y = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 1 - %z = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 2 - %w = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 3 - store float 0.0, float addrspace(5)* %x - store float 1.0, float addrspace(5)* %y - store float 2.0, float addrspace(5)* %z - store float 4.0, float addrspace(5)* %w - %tmp1 = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 %index - %tmp2 = load float, float addrspace(5)* %tmp1 - store float %tmp2, float addrspace(1)* %out + %y = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 1 + %z = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 2 + %w = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 3 + store float 0.0, ptr addrspace(5) %tmp + store float 1.0, ptr addrspace(5) %y + store float 2.0, ptr addrspace(5) %z + store float 4.0, ptr addrspace(5) %w + %tmp1 = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 %index + %tmp2 = load float, ptr addrspace(5) %tmp1 + store float %tmp2, ptr addrspace(1) %out ret void } @@ -138,20 +129,19 @@ entry: ; OPT-LABEL: @vector_read_with_local_arg( ; OPT: %0 = extractelement <4 x i32> , i32 %index -; OPT: store i32 %0, i32 addrspace(1)* %out, align 4 -define amdgpu_kernel void @vector_read_with_local_arg(i32 addrspace(3)* %stopper, i32 addrspace(1)* %out, i32 %index) { +; OPT: store i32 %0, ptr addrspace(1) %out, align 4 +define amdgpu_kernel void @vector_read_with_local_arg(ptr addrspace(3) %stopper, ptr addrspace(1) %out, i32 %index) { entry: %tmp = alloca [4 x i32], addrspace(5) - %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0 - %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1 - %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2 - %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3 - store i32 0, i32 addrspace(5)* %x - store i32 1, i32 addrspace(5)* %y - store i32 2, i32 addrspace(5)* %z - store i32 3, i32 addrspace(5)* %w - %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index - %tmp2 = load i32, i32 addrspace(5)* %tmp1 - store i32 %tmp2, i32 addrspace(1)* %out + %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1 + %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2 + %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3 + store i32 0, ptr addrspace(5) %tmp + store i32 1, ptr addrspace(5) %y + store i32 2, ptr addrspace(5) %z + store i32 3, ptr addrspace(5) %w + %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index + %tmp2 = load i32, ptr addrspace(5) %tmp1 + store i32 %tmp2, ptr addrspace(1) %out ret void }