AMDGPU: Convert promote alloca tests to opaque pointers
This commit is contained in:
parent
b3df889b71
commit
50caf6936b
@ -6,45 +6,43 @@
|
||||
; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] poison, align 4
|
||||
; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] poison, align 4
|
||||
|
||||
define amdgpu_kernel void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
|
||||
define amdgpu_kernel void @promote_alloca_size_63(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 {
|
||||
entry:
|
||||
%stack = alloca [5 x i32], align 4, addrspace(5)
|
||||
%0 = load i32, i32 addrspace(1)* %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
|
||||
store i32 4, i32 addrspace(5)* %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
|
||||
%1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
|
||||
store i32 5, i32 addrspace(5)* %arrayidx3, align 4
|
||||
%arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
|
||||
%2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
|
||||
store i32 %2, i32 addrspace(1)* %out, align 4
|
||||
%arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
|
||||
%3 = load i32, i32 addrspace(5)* %arrayidx12
|
||||
%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
|
||||
store i32 %3, i32 addrspace(1)* %arrayidx13
|
||||
%0 = load i32, ptr addrspace(1) %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
|
||||
store i32 4, ptr addrspace(5) %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
|
||||
%1 = load i32, ptr addrspace(1) %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
|
||||
store i32 5, ptr addrspace(5) %arrayidx3, align 4
|
||||
%2 = load i32, ptr addrspace(5) %stack, align 4
|
||||
store i32 %2, ptr addrspace(1) %out, align 4
|
||||
%arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
|
||||
%3 = load i32, ptr addrspace(5) %arrayidx12
|
||||
%arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
|
||||
store i32 %3, ptr addrspace(1) %arrayidx13
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] poison, align 4
|
||||
|
||||
define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
|
||||
define amdgpu_kernel void @promote_alloca_size_256(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #1 {
|
||||
entry:
|
||||
%stack = alloca [5 x i32], align 4, addrspace(5)
|
||||
%0 = load i32, i32 addrspace(1)* %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
|
||||
store i32 4, i32 addrspace(5)* %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
|
||||
%1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
|
||||
store i32 5, i32 addrspace(5)* %arrayidx3, align 4
|
||||
%arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
|
||||
%2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
|
||||
store i32 %2, i32 addrspace(1)* %out, align 4
|
||||
%arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
|
||||
%3 = load i32, i32 addrspace(5)* %arrayidx12
|
||||
%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
|
||||
store i32 %3, i32 addrspace(1)* %arrayidx13
|
||||
%0 = load i32, ptr addrspace(1) %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
|
||||
store i32 4, ptr addrspace(5) %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
|
||||
%1 = load i32, ptr addrspace(1) %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
|
||||
store i32 5, ptr addrspace(5) %arrayidx3, align 4
|
||||
%2 = load i32, ptr addrspace(5) %stack, align 4
|
||||
store i32 %2, ptr addrspace(1) %out, align 4
|
||||
%arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
|
||||
%3 = load i32, ptr addrspace(5) %arrayidx12
|
||||
%arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
|
||||
store i32 %3, ptr addrspace(1) %arrayidx13
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -52,69 +50,66 @@ entry:
|
||||
; CI: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1024 x [5 x i32]] poison, align 4
|
||||
; GFX10PLUS: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1024 x [5 x i32]] poison, align 4
|
||||
|
||||
define amdgpu_kernel void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
|
||||
define amdgpu_kernel void @promote_alloca_size_1600(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #2 {
|
||||
entry:
|
||||
%stack = alloca [5 x i32], align 4, addrspace(5)
|
||||
%0 = load i32, i32 addrspace(1)* %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
|
||||
store i32 4, i32 addrspace(5)* %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
|
||||
%1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
|
||||
store i32 5, i32 addrspace(5)* %arrayidx3, align 4
|
||||
%arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
|
||||
%2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
|
||||
store i32 %2, i32 addrspace(1)* %out, align 4
|
||||
%arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
|
||||
%3 = load i32, i32 addrspace(5)* %arrayidx12
|
||||
%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
|
||||
store i32 %3, i32 addrspace(1)* %arrayidx13
|
||||
%0 = load i32, ptr addrspace(1) %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
|
||||
store i32 4, ptr addrspace(5) %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
|
||||
%1 = load i32, ptr addrspace(1) %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
|
||||
store i32 5, ptr addrspace(5) %arrayidx3, align 4
|
||||
%2 = load i32, ptr addrspace(5) %stack, align 4
|
||||
store i32 %2, ptr addrspace(1) %out, align 4
|
||||
%arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
|
||||
%3 = load i32, ptr addrspace(5) %arrayidx12
|
||||
%arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
|
||||
store i32 %3, ptr addrspace(1) %arrayidx13
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: @occupancy_0(
|
||||
; CI-NOT: alloca [5 x i32]
|
||||
; SI: alloca [5 x i32]
|
||||
define amdgpu_kernel void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
|
||||
define amdgpu_kernel void @occupancy_0(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #3 {
|
||||
entry:
|
||||
%stack = alloca [5 x i32], align 4, addrspace(5)
|
||||
%0 = load i32, i32 addrspace(1)* %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
|
||||
store i32 4, i32 addrspace(5)* %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
|
||||
%1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
|
||||
store i32 5, i32 addrspace(5)* %arrayidx3, align 4
|
||||
%arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
|
||||
%2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
|
||||
store i32 %2, i32 addrspace(1)* %out, align 4
|
||||
%arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
|
||||
%3 = load i32, i32 addrspace(5)* %arrayidx12
|
||||
%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
|
||||
store i32 %3, i32 addrspace(1)* %arrayidx13
|
||||
%0 = load i32, ptr addrspace(1) %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
|
||||
store i32 4, ptr addrspace(5) %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
|
||||
%1 = load i32, ptr addrspace(1) %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
|
||||
store i32 5, ptr addrspace(5) %arrayidx3, align 4
|
||||
%2 = load i32, ptr addrspace(5) %stack, align 4
|
||||
store i32 %2, ptr addrspace(1) %out, align 4
|
||||
%arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
|
||||
%3 = load i32, ptr addrspace(5) %arrayidx12
|
||||
%arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
|
||||
store i32 %3, ptr addrspace(1) %arrayidx13
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: @occupancy_max(
|
||||
; CI-NOT: alloca [5 x i32]
|
||||
; SI: alloca [5 x i32]
|
||||
define amdgpu_kernel void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
|
||||
define amdgpu_kernel void @occupancy_max(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #4 {
|
||||
entry:
|
||||
%stack = alloca [5 x i32], align 4, addrspace(5)
|
||||
%0 = load i32, i32 addrspace(1)* %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
|
||||
store i32 4, i32 addrspace(5)* %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
|
||||
%1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
|
||||
store i32 5, i32 addrspace(5)* %arrayidx3, align 4
|
||||
%arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
|
||||
%2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
|
||||
store i32 %2, i32 addrspace(1)* %out, align 4
|
||||
%arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
|
||||
%3 = load i32, i32 addrspace(5)* %arrayidx12
|
||||
%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
|
||||
store i32 %3, i32 addrspace(1)* %arrayidx13
|
||||
%0 = load i32, ptr addrspace(1) %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
|
||||
store i32 4, ptr addrspace(5) %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
|
||||
%1 = load i32, ptr addrspace(1) %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
|
||||
store i32 5, ptr addrspace(5) %arrayidx3, align 4
|
||||
%2 = load i32, ptr addrspace(5) %stack, align 4
|
||||
store i32 %2, ptr addrspace(1) %out, align 4
|
||||
%arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
|
||||
%3 = load i32, ptr addrspace(5) %arrayidx12
|
||||
%arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
|
||||
store i32 %3, ptr addrspace(1) %arrayidx13
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -122,25 +117,24 @@ entry:
|
||||
; CI-LABEL: @occupancy_6(
|
||||
; SI: alloca
|
||||
; CI-NOT: alloca
|
||||
define amdgpu_kernel void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
|
||||
define amdgpu_kernel void @occupancy_6(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #5 {
|
||||
entry:
|
||||
%stack = alloca [42 x i8], align 4, addrspace(5)
|
||||
%tmp = load i8, i8 addrspace(1)* %in, align 1
|
||||
%tmp = load i8, ptr addrspace(1) %in, align 1
|
||||
%tmp4 = sext i8 %tmp to i64
|
||||
%arrayidx1 = getelementptr inbounds [42 x i8], [42 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4
|
||||
store i8 4, i8 addrspace(5)* %arrayidx1, align 1
|
||||
%arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
|
||||
%tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
|
||||
%arrayidx1 = getelementptr inbounds [42 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
|
||||
store i8 4, ptr addrspace(5) %arrayidx1, align 1
|
||||
%arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
|
||||
%tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
|
||||
%tmp5 = sext i8 %tmp1 to i64
|
||||
%arrayidx3 = getelementptr inbounds [42 x i8], [42 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5
|
||||
store i8 5, i8 addrspace(5)* %arrayidx3, align 1
|
||||
%arrayidx10 = getelementptr inbounds [42 x i8], [42 x i8] addrspace(5)* %stack, i64 0, i64 0
|
||||
%tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1
|
||||
store i8 %tmp2, i8 addrspace(1)* %out, align 1
|
||||
%arrayidx12 = getelementptr inbounds [42 x i8], [42 x i8] addrspace(5)* %stack, i64 0, i64 1
|
||||
%tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1
|
||||
%arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
|
||||
store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
|
||||
%arrayidx3 = getelementptr inbounds [42 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
|
||||
store i8 5, ptr addrspace(5) %arrayidx3, align 1
|
||||
%tmp2 = load i8, ptr addrspace(5) %stack, align 1
|
||||
store i8 %tmp2, ptr addrspace(1) %out, align 1
|
||||
%arrayidx12 = getelementptr inbounds [42 x i8], ptr addrspace(5) %stack, i64 0, i64 1
|
||||
%tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
|
||||
%arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
|
||||
store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -148,25 +142,24 @@ entry:
|
||||
; SICI: alloca [43 x i8]
|
||||
; GFX10PLUS-NOT: alloca
|
||||
|
||||
define amdgpu_kernel void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
|
||||
define amdgpu_kernel void @occupancy_6_over(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #5 {
|
||||
entry:
|
||||
%stack = alloca [43 x i8], align 4, addrspace(5)
|
||||
%tmp = load i8, i8 addrspace(1)* %in, align 1
|
||||
%tmp = load i8, ptr addrspace(1) %in, align 1
|
||||
%tmp4 = sext i8 %tmp to i64
|
||||
%arrayidx1 = getelementptr inbounds [43 x i8], [43 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4
|
||||
store i8 4, i8 addrspace(5)* %arrayidx1, align 1
|
||||
%arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
|
||||
%tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
|
||||
%arrayidx1 = getelementptr inbounds [43 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
|
||||
store i8 4, ptr addrspace(5) %arrayidx1, align 1
|
||||
%arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
|
||||
%tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
|
||||
%tmp5 = sext i8 %tmp1 to i64
|
||||
%arrayidx3 = getelementptr inbounds [43 x i8], [43 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5
|
||||
store i8 5, i8 addrspace(5)* %arrayidx3, align 1
|
||||
%arrayidx10 = getelementptr inbounds [43 x i8], [43 x i8] addrspace(5)* %stack, i64 0, i64 0
|
||||
%tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1
|
||||
store i8 %tmp2, i8 addrspace(1)* %out, align 1
|
||||
%arrayidx12 = getelementptr inbounds [43 x i8], [43 x i8] addrspace(5)* %stack, i64 0, i64 1
|
||||
%tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1
|
||||
%arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
|
||||
store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
|
||||
%arrayidx3 = getelementptr inbounds [43 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
|
||||
store i8 5, ptr addrspace(5) %arrayidx3, align 1
|
||||
%tmp2 = load i8, ptr addrspace(5) %stack, align 1
|
||||
store i8 %tmp2, ptr addrspace(1) %out, align 1
|
||||
%arrayidx12 = getelementptr inbounds [43 x i8], ptr addrspace(5) %stack, i64 0, i64 1
|
||||
%tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
|
||||
%arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
|
||||
store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -174,25 +167,24 @@ entry:
|
||||
; CI-LABEL: @occupancy_8(
|
||||
; SI: alloca
|
||||
; CI-NOT: alloca
|
||||
define amdgpu_kernel void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
|
||||
define amdgpu_kernel void @occupancy_8(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #6 {
|
||||
entry:
|
||||
%stack = alloca [32 x i8], align 4, addrspace(5)
|
||||
%tmp = load i8, i8 addrspace(1)* %in, align 1
|
||||
%tmp = load i8, ptr addrspace(1) %in, align 1
|
||||
%tmp4 = sext i8 %tmp to i64
|
||||
%arrayidx1 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4
|
||||
store i8 4, i8 addrspace(5)* %arrayidx1, align 1
|
||||
%arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
|
||||
%tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
|
||||
%arrayidx1 = getelementptr inbounds [32 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
|
||||
store i8 4, ptr addrspace(5) %arrayidx1, align 1
|
||||
%arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
|
||||
%tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
|
||||
%tmp5 = sext i8 %tmp1 to i64
|
||||
%arrayidx3 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5
|
||||
store i8 5, i8 addrspace(5)* %arrayidx3, align 1
|
||||
%arrayidx10 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %stack, i64 0, i64 0
|
||||
%tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1
|
||||
store i8 %tmp2, i8 addrspace(1)* %out, align 1
|
||||
%arrayidx12 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %stack, i64 0, i64 1
|
||||
%tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1
|
||||
%arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
|
||||
store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
|
||||
%arrayidx3 = getelementptr inbounds [32 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
|
||||
store i8 5, ptr addrspace(5) %arrayidx3, align 1
|
||||
%tmp2 = load i8, ptr addrspace(5) %stack, align 1
|
||||
store i8 %tmp2, ptr addrspace(1) %out, align 1
|
||||
%arrayidx12 = getelementptr inbounds [32 x i8], ptr addrspace(5) %stack, i64 0, i64 1
|
||||
%tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
|
||||
%arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
|
||||
store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -200,25 +192,24 @@ entry:
|
||||
; SICI: alloca [33 x i8]
|
||||
; GFX10PLUS-NOT: alloca
|
||||
|
||||
define amdgpu_kernel void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
|
||||
define amdgpu_kernel void @occupancy_8_over(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #6 {
|
||||
entry:
|
||||
%stack = alloca [33 x i8], align 4, addrspace(5)
|
||||
%tmp = load i8, i8 addrspace(1)* %in, align 1
|
||||
%tmp = load i8, ptr addrspace(1) %in, align 1
|
||||
%tmp4 = sext i8 %tmp to i64
|
||||
%arrayidx1 = getelementptr inbounds [33 x i8], [33 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4
|
||||
store i8 4, i8 addrspace(5)* %arrayidx1, align 1
|
||||
%arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
|
||||
%tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
|
||||
%arrayidx1 = getelementptr inbounds [33 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
|
||||
store i8 4, ptr addrspace(5) %arrayidx1, align 1
|
||||
%arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
|
||||
%tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
|
||||
%tmp5 = sext i8 %tmp1 to i64
|
||||
%arrayidx3 = getelementptr inbounds [33 x i8], [33 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5
|
||||
store i8 5, i8 addrspace(5)* %arrayidx3, align 1
|
||||
%arrayidx10 = getelementptr inbounds [33 x i8], [33 x i8] addrspace(5)* %stack, i64 0, i64 0
|
||||
%tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1
|
||||
store i8 %tmp2, i8 addrspace(1)* %out, align 1
|
||||
%arrayidx12 = getelementptr inbounds [33 x i8], [33 x i8] addrspace(5)* %stack, i64 0, i64 1
|
||||
%tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1
|
||||
%arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
|
||||
store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
|
||||
%arrayidx3 = getelementptr inbounds [33 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
|
||||
store i8 5, ptr addrspace(5) %arrayidx3, align 1
|
||||
%tmp2 = load i8, ptr addrspace(5) %stack, align 1
|
||||
store i8 %tmp2, ptr addrspace(1) %out, align 1
|
||||
%arrayidx12 = getelementptr inbounds [33 x i8], ptr addrspace(5) %stack, i64 0, i64 1
|
||||
%tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
|
||||
%arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
|
||||
store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -226,25 +217,24 @@ entry:
|
||||
; CI-LABEL: @occupancy_9(
|
||||
; SI: alloca
|
||||
; CI-NOT: alloca
|
||||
define amdgpu_kernel void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
|
||||
define amdgpu_kernel void @occupancy_9(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #7 {
|
||||
entry:
|
||||
%stack = alloca [28 x i8], align 4, addrspace(5)
|
||||
%tmp = load i8, i8 addrspace(1)* %in, align 1
|
||||
%tmp = load i8, ptr addrspace(1) %in, align 1
|
||||
%tmp4 = sext i8 %tmp to i64
|
||||
%arrayidx1 = getelementptr inbounds [28 x i8], [28 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4
|
||||
store i8 4, i8 addrspace(5)* %arrayidx1, align 1
|
||||
%arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
|
||||
%tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
|
||||
%arrayidx1 = getelementptr inbounds [28 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
|
||||
store i8 4, ptr addrspace(5) %arrayidx1, align 1
|
||||
%arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
|
||||
%tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
|
||||
%tmp5 = sext i8 %tmp1 to i64
|
||||
%arrayidx3 = getelementptr inbounds [28 x i8], [28 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5
|
||||
store i8 5, i8 addrspace(5)* %arrayidx3, align 1
|
||||
%arrayidx10 = getelementptr inbounds [28 x i8], [28 x i8] addrspace(5)* %stack, i64 0, i64 0
|
||||
%tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1
|
||||
store i8 %tmp2, i8 addrspace(1)* %out, align 1
|
||||
%arrayidx12 = getelementptr inbounds [28 x i8], [28 x i8] addrspace(5)* %stack, i64 0, i64 1
|
||||
%tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1
|
||||
%arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
|
||||
store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
|
||||
%arrayidx3 = getelementptr inbounds [28 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
|
||||
store i8 5, ptr addrspace(5) %arrayidx3, align 1
|
||||
%tmp2 = load i8, ptr addrspace(5) %stack, align 1
|
||||
store i8 %tmp2, ptr addrspace(1) %out, align 1
|
||||
%arrayidx12 = getelementptr inbounds [28 x i8], ptr addrspace(5) %stack, i64 0, i64 1
|
||||
%tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
|
||||
%arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
|
||||
store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -252,25 +242,24 @@ entry:
|
||||
; SICI: alloca [29 x i8]
|
||||
; GFX10PLUS-NOT: alloca
|
||||
|
||||
define amdgpu_kernel void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
|
||||
define amdgpu_kernel void @occupancy_9_over(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #7 {
|
||||
entry:
|
||||
%stack = alloca [29 x i8], align 4, addrspace(5)
|
||||
%tmp = load i8, i8 addrspace(1)* %in, align 1
|
||||
%tmp = load i8, ptr addrspace(1) %in, align 1
|
||||
%tmp4 = sext i8 %tmp to i64
|
||||
%arrayidx1 = getelementptr inbounds [29 x i8], [29 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4
|
||||
store i8 4, i8 addrspace(5)* %arrayidx1, align 1
|
||||
%arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
|
||||
%tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
|
||||
%arrayidx1 = getelementptr inbounds [29 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
|
||||
store i8 4, ptr addrspace(5) %arrayidx1, align 1
|
||||
%arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
|
||||
%tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
|
||||
%tmp5 = sext i8 %tmp1 to i64
|
||||
%arrayidx3 = getelementptr inbounds [29 x i8], [29 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5
|
||||
store i8 5, i8 addrspace(5)* %arrayidx3, align 1
|
||||
%arrayidx10 = getelementptr inbounds [29 x i8], [29 x i8] addrspace(5)* %stack, i64 0, i64 0
|
||||
%tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1
|
||||
store i8 %tmp2, i8 addrspace(1)* %out, align 1
|
||||
%arrayidx12 = getelementptr inbounds [29 x i8], [29 x i8] addrspace(5)* %stack, i64 0, i64 1
|
||||
%tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1
|
||||
%arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
|
||||
store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
|
||||
%arrayidx3 = getelementptr inbounds [29 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
|
||||
store i8 5, ptr addrspace(5) %arrayidx3, align 1
|
||||
%tmp2 = load i8, ptr addrspace(5) %stack, align 1
|
||||
store i8 %tmp2, ptr addrspace(1) %out, align 1
|
||||
%arrayidx12 = getelementptr inbounds [29 x i8], ptr addrspace(5) %stack, i64 0, i64 1
|
||||
%tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
|
||||
%arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
|
||||
store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
@ -18,44 +18,40 @@ define amdgpu_vs void @promote_1d_aggr() #0 {
|
||||
; CHECK-LABEL: @promote_1d_aggr(
|
||||
; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
|
||||
; CHECK-NEXT: [[F1:%.*]] = alloca [1 x float], align 4, addrspace(5)
|
||||
; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK:%.*]], [[BLOCK]] addrspace(1)* @block, i32 0, i32 1
|
||||
; CHECK-NEXT: [[FOO1:%.*]] = load i32, i32 addrspace(1)* [[FOO]], align 4
|
||||
; CHECK-NEXT: store i32 [[FOO1]], i32 addrspace(5)* [[I]], align 4
|
||||
; CHECK-NEXT: [[FOO2:%.*]] = getelementptr [[BLOCK]], [[BLOCK]] addrspace(1)* @block, i32 0, i32 0
|
||||
; CHECK-NEXT: [[FOO3:%.*]] = load [1 x float], [1 x float] addrspace(1)* [[FOO2]], align 4
|
||||
; CHECK-NEXT: store [1 x float] [[FOO3]], [1 x float] addrspace(5)* [[F1]], align 4
|
||||
; CHECK-NEXT: [[FOO4:%.*]] = load i32, i32 addrspace(5)* [[I]], align 4
|
||||
; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [1 x float], [1 x float] addrspace(5)* [[F1]], i32 0, i32 [[FOO4]]
|
||||
; CHECK-NEXT: [[FOO6:%.*]] = load float, float addrspace(5)* [[FOO5]], align 4
|
||||
; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK:%.*]], ptr addrspace(1) @block, i32 0, i32 1
|
||||
; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4
|
||||
; CHECK-NEXT: store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4
|
||||
; CHECK-NEXT: [[FOO3:%.*]] = load [1 x float], ptr addrspace(1) @block, align 4
|
||||
; CHECK-NEXT: store [1 x float] [[FOO3]], ptr addrspace(5) [[F1]], align 4
|
||||
; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(5) [[I]], align 4
|
||||
; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
|
||||
; CHECK-NEXT: [[FOO6:%.*]] = load float, ptr addrspace(5) [[FOO5]], align 4
|
||||
; CHECK-NEXT: [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5)
|
||||
; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, <4 x float> addrspace(5)* [[FOO7]], align 16
|
||||
; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, ptr addrspace(5) [[FOO7]], align 16
|
||||
; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[FOO6]], i32 0
|
||||
; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[FOO6]], i32 1
|
||||
; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[FOO6]], i32 2
|
||||
; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[FOO6]], i32 3
|
||||
; CHECK-NEXT: [[FOO13:%.*]] = getelementptr [[GL_PERVERTEX:%.*]], [[GL_PERVERTEX]] addrspace(1)* @pv, i32 0, i32 0
|
||||
; CHECK-NEXT: store <4 x float> [[FOO12]], <4 x float> addrspace(1)* [[FOO13]], align 16
|
||||
; CHECK-NEXT: store <4 x float> [[FOO12]], ptr addrspace(1) @pv, align 16
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%i = alloca i32, addrspace(5)
|
||||
%f1 = alloca [1 x float], addrspace(5)
|
||||
%foo = getelementptr %Block, %Block addrspace(1)* @block, i32 0, i32 1
|
||||
%foo1 = load i32, i32 addrspace(1)* %foo
|
||||
store i32 %foo1, i32 addrspace(5)* %i
|
||||
%foo2 = getelementptr %Block, %Block addrspace(1)* @block, i32 0, i32 0
|
||||
%foo3 = load [1 x float], [1 x float] addrspace(1)* %foo2
|
||||
store [1 x float] %foo3, [1 x float] addrspace(5)* %f1
|
||||
%foo4 = load i32, i32 addrspace(5)* %i
|
||||
%foo5 = getelementptr [1 x float], [1 x float] addrspace(5)* %f1, i32 0, i32 %foo4
|
||||
%foo6 = load float, float addrspace(5)* %foo5
|
||||
%foo = getelementptr %Block, ptr addrspace(1) @block, i32 0, i32 1
|
||||
%foo1 = load i32, ptr addrspace(1) %foo
|
||||
store i32 %foo1, ptr addrspace(5) %i
|
||||
%foo3 = load [1 x float], ptr addrspace(1) @block
|
||||
store [1 x float] %foo3, ptr addrspace(5) %f1
|
||||
%foo4 = load i32, ptr addrspace(5) %i
|
||||
%foo5 = getelementptr [1 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
|
||||
%foo6 = load float, ptr addrspace(5) %foo5
|
||||
%foo7 = alloca <4 x float>, addrspace(5)
|
||||
%foo8 = load <4 x float>, <4 x float> addrspace(5)* %foo7
|
||||
%foo8 = load <4 x float>, ptr addrspace(5) %foo7
|
||||
%foo9 = insertelement <4 x float> %foo8, float %foo6, i32 0
|
||||
%foo10 = insertelement <4 x float> %foo9, float %foo6, i32 1
|
||||
%foo11 = insertelement <4 x float> %foo10, float %foo6, i32 2
|
||||
%foo12 = insertelement <4 x float> %foo11, float %foo6, i32 3
|
||||
%foo13 = getelementptr %gl_PerVertex, %gl_PerVertex addrspace(1)* @pv, i32 0, i32 0
|
||||
store <4 x float> %foo12, <4 x float> addrspace(1)* %foo13
|
||||
store <4 x float> %foo12, ptr addrspace(1) @pv
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -66,44 +62,36 @@ define amdgpu_vs void @promote_store_aggr() #0 {
|
||||
; CHECK-LABEL: @promote_store_aggr(
|
||||
; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
|
||||
; CHECK-NEXT: [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5)
|
||||
; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK2:%.*]], [[BLOCK2]] addrspace(1)* @block2, i32 0, i32 0
|
||||
; CHECK-NEXT: [[FOO1:%.*]] = load i32, i32 addrspace(1)* [[FOO]], align 4
|
||||
; CHECK-NEXT: store i32 [[FOO1]], i32 addrspace(5)* [[I]], align 4
|
||||
; CHECK-NEXT: [[FOO2:%.*]] = load i32, i32 addrspace(5)* [[I]], align 4
|
||||
; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) @block2, align 4
|
||||
; CHECK-NEXT: store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4
|
||||
; CHECK-NEXT: [[FOO2:%.*]] = load i32, ptr addrspace(5) [[I]], align 4
|
||||
; CHECK-NEXT: [[FOO3:%.*]] = sitofp i32 [[FOO2]] to float
|
||||
; CHECK-NEXT: [[FOO4:%.*]] = getelementptr [2 x float], [2 x float] addrspace(5)* [[F1]], i32 0, i32 0
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = bitcast [2 x float] addrspace(5)* [[F1]] to <2 x float> addrspace(5)*
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float> addrspace(5)* [[TMP1]], align 8
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[FOO3]], i32 0
|
||||
; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float> addrspace(5)* [[TMP1]], align 8
|
||||
; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], [2 x float] addrspace(5)* [[F1]], i32 0, i32 1
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = bitcast [2 x float] addrspace(5)* [[F1]] to <2 x float> addrspace(5)*
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, <2 x float> addrspace(5)* [[TMP4]], align 8
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float 2.000000e+00, i64 1
|
||||
; CHECK-NEXT: store <2 x float> [[TMP6]], <2 x float> addrspace(5)* [[TMP4]], align 8
|
||||
; CHECK-NEXT: [[FOO6:%.*]] = load [2 x float], [2 x float] addrspace(5)* [[F1]], align 4
|
||||
; CHECK-NEXT: [[FOO7:%.*]] = getelementptr [[BLOCK2]], [[BLOCK2]] addrspace(1)* @block2, i32 0, i32 1
|
||||
; CHECK-NEXT: store [2 x float] [[FOO6]], [2 x float] addrspace(1)* [[FOO7]], align 4
|
||||
; CHECK-NEXT: [[FOO8:%.*]] = getelementptr [[GL_PERVERTEX:%.*]], [[GL_PERVERTEX]] addrspace(1)* @pv, i32 0, i32 0
|
||||
; CHECK-NEXT: store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float> addrspace(1)* [[FOO8]], align 16
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 8
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3]], i32 0
|
||||
; CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 8
|
||||
; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 8
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float 2.000000e+00, i64 1
|
||||
; CHECK-NEXT: store <2 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 8
|
||||
; CHECK-NEXT: [[FOO6:%.*]] = load [2 x float], ptr addrspace(5) [[F1]], align 4
|
||||
; CHECK-NEXT: [[FOO7:%.*]] = getelementptr [[BLOCK2:%.*]], ptr addrspace(1) @block2, i32 0, i32 1
|
||||
; CHECK-NEXT: store [2 x float] [[FOO6]], ptr addrspace(1) [[FOO7]], align 4
|
||||
; CHECK-NEXT: store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, ptr addrspace(1) @pv, align 16
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%i = alloca i32, addrspace(5)
|
||||
%f1 = alloca [2 x float], addrspace(5)
|
||||
%foo = getelementptr %Block2, %Block2 addrspace(1)* @block2, i32 0, i32 0
|
||||
%foo1 = load i32, i32 addrspace(1)* %foo
|
||||
store i32 %foo1, i32 addrspace(5)* %i
|
||||
%foo2 = load i32, i32 addrspace(5)* %i
|
||||
%foo1 = load i32, ptr addrspace(1) @block2
|
||||
store i32 %foo1, ptr addrspace(5) %i
|
||||
%foo2 = load i32, ptr addrspace(5) %i
|
||||
%foo3 = sitofp i32 %foo2 to float
|
||||
%foo4 = getelementptr [2 x float], [2 x float] addrspace(5)* %f1, i32 0, i32 0
|
||||
store float %foo3, float addrspace(5)* %foo4
|
||||
%foo5 = getelementptr [2 x float], [2 x float] addrspace(5)* %f1, i32 0, i32 1
|
||||
store float 2.000000e+00, float addrspace(5)* %foo5
|
||||
%foo6 = load [2 x float], [2 x float] addrspace(5)* %f1
|
||||
%foo7 = getelementptr %Block2, %Block2 addrspace(1)* @block2, i32 0, i32 1
|
||||
store [2 x float] %foo6, [2 x float] addrspace(1)* %foo7
|
||||
%foo8 = getelementptr %gl_PerVertex, %gl_PerVertex addrspace(1)* @pv, i32 0, i32 0
|
||||
store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float> addrspace(1)* %foo8
|
||||
store float %foo3, ptr addrspace(5) %f1
|
||||
%foo5 = getelementptr [2 x float], ptr addrspace(5) %f1, i32 0, i32 1
|
||||
store float 2.000000e+00, ptr addrspace(5) %foo5
|
||||
%foo6 = load [2 x float], ptr addrspace(5) %f1
|
||||
%foo7 = getelementptr %Block2, ptr addrspace(1) @block2, i32 0, i32 1
|
||||
store [2 x float] %foo6, ptr addrspace(1) %foo7
|
||||
store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, ptr addrspace(1) @pv
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -114,46 +102,41 @@ define amdgpu_vs void @promote_load_from_store_aggr() #0 {
|
||||
; CHECK-LABEL: @promote_load_from_store_aggr(
|
||||
; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
|
||||
; CHECK-NEXT: [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5)
|
||||
; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK3:%.*]], [[BLOCK3]] addrspace(1)* @block3, i32 0, i32 1
|
||||
; CHECK-NEXT: [[FOO1:%.*]] = load i32, i32 addrspace(1)* [[FOO]], align 4
|
||||
; CHECK-NEXT: store i32 [[FOO1]], i32 addrspace(5)* [[I]], align 4
|
||||
; CHECK-NEXT: [[FOO2:%.*]] = getelementptr [[BLOCK3]], [[BLOCK3]] addrspace(1)* @block3, i32 0, i32 0
|
||||
; CHECK-NEXT: [[FOO3:%.*]] = load [2 x float], [2 x float] addrspace(1)* [[FOO2]], align 4
|
||||
; CHECK-NEXT: store [2 x float] [[FOO3]], [2 x float] addrspace(5)* [[F1]], align 4
|
||||
; CHECK-NEXT: [[FOO4:%.*]] = load i32, i32 addrspace(5)* [[I]], align 4
|
||||
; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], [2 x float] addrspace(5)* [[F1]], i32 0, i32 [[FOO4]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = bitcast [2 x float] addrspace(5)* [[F1]] to <2 x float> addrspace(5)*
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float> addrspace(5)* [[TMP1]], align 8
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 [[FOO4]]
|
||||
; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 1
|
||||
; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4
|
||||
; CHECK-NEXT: store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4
|
||||
; CHECK-NEXT: [[FOO3:%.*]] = load [2 x float], ptr addrspace(1) @block3, align 4
|
||||
; CHECK-NEXT: store [2 x float] [[FOO3]], ptr addrspace(5) [[F1]], align 4
|
||||
; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(5) [[I]], align 4
|
||||
; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 8
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 [[FOO4]]
|
||||
; CHECK-NEXT: [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5)
|
||||
; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, <4 x float> addrspace(5)* [[FOO7]], align 16
|
||||
; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[TMP3]], i32 0
|
||||
; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP3]], i32 1
|
||||
; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP3]], i32 2
|
||||
; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP3]], i32 3
|
||||
; CHECK-NEXT: [[FOO13:%.*]] = getelementptr [[GL_PERVERTEX:%.*]], [[GL_PERVERTEX]] addrspace(1)* @pv, i32 0, i32 0
|
||||
; CHECK-NEXT: store <4 x float> [[FOO12]], <4 x float> addrspace(1)* [[FOO13]], align 16
|
||||
; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, ptr addrspace(5) [[FOO7]], align 16
|
||||
; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[TMP2]], i32 0
|
||||
; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP2]], i32 1
|
||||
; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP2]], i32 2
|
||||
; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP2]], i32 3
|
||||
; CHECK-NEXT: store <4 x float> [[FOO12]], ptr addrspace(1) @pv, align 16
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%i = alloca i32, addrspace(5)
|
||||
%f1 = alloca [2 x float], addrspace(5)
|
||||
%foo = getelementptr %Block3, %Block3 addrspace(1)* @block3, i32 0, i32 1
|
||||
%foo1 = load i32, i32 addrspace(1)* %foo
|
||||
store i32 %foo1, i32 addrspace(5)* %i
|
||||
%foo2 = getelementptr %Block3, %Block3 addrspace(1)* @block3, i32 0, i32 0
|
||||
%foo3 = load [2 x float], [2 x float] addrspace(1)* %foo2
|
||||
store [2 x float] %foo3, [2 x float] addrspace(5)* %f1
|
||||
%foo4 = load i32, i32 addrspace(5)* %i
|
||||
%foo5 = getelementptr [2 x float], [2 x float] addrspace(5)* %f1, i32 0, i32 %foo4
|
||||
%foo6 = load float, float addrspace(5)* %foo5
|
||||
%foo = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 1
|
||||
%foo1 = load i32, ptr addrspace(1) %foo
|
||||
store i32 %foo1, ptr addrspace(5) %i
|
||||
%foo3 = load [2 x float], ptr addrspace(1) @block3
|
||||
store [2 x float] %foo3, ptr addrspace(5) %f1
|
||||
%foo4 = load i32, ptr addrspace(5) %i
|
||||
%foo5 = getelementptr [2 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
|
||||
%foo6 = load float, ptr addrspace(5) %foo5
|
||||
%foo7 = alloca <4 x float>, addrspace(5)
|
||||
%foo8 = load <4 x float>, <4 x float> addrspace(5)* %foo7
|
||||
%foo8 = load <4 x float>, ptr addrspace(5) %foo7
|
||||
%foo9 = insertelement <4 x float> %foo8, float %foo6, i32 0
|
||||
%foo10 = insertelement <4 x float> %foo9, float %foo6, i32 1
|
||||
%foo11 = insertelement <4 x float> %foo10, float %foo6, i32 2
|
||||
%foo12 = insertelement <4 x float> %foo11, float %foo6, i32 3
|
||||
%foo13 = getelementptr %gl_PerVertex, %gl_PerVertex addrspace(1)* @pv, i32 0, i32 0
|
||||
store <4 x float> %foo12, <4 x float> addrspace(1)* %foo13
|
||||
store <4 x float> %foo12, ptr addrspace(1) @pv
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -163,70 +146,61 @@ define amdgpu_vs void @promote_load_from_store_aggr() #0 {
|
||||
define amdgpu_ps void @promote_double_aggr() #0 {
|
||||
; CHECK-LABEL: @promote_double_aggr(
|
||||
; CHECK-NEXT: [[S:%.*]] = alloca [2 x double], align 8, addrspace(5)
|
||||
; CHECK-NEXT: [[FOO:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(1)* @tmp_g, i32 0, i32 0, i32 0
|
||||
; CHECK-NEXT: [[FOO1:%.*]] = load double, double addrspace(1)* [[FOO]], align 8
|
||||
; CHECK-NEXT: [[FOO2:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(1)* @tmp_g, i32 0, i32 0, i32 1
|
||||
; CHECK-NEXT: [[FOO3:%.*]] = load double, double addrspace(1)* [[FOO2]], align 8
|
||||
; CHECK-NEXT: [[FOO:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 0
|
||||
; CHECK-NEXT: [[FOO1:%.*]] = load double, ptr addrspace(1) [[FOO]], align 8
|
||||
; CHECK-NEXT: [[FOO2:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 1
|
||||
; CHECK-NEXT: [[FOO3:%.*]] = load double, ptr addrspace(1) [[FOO2]], align 8
|
||||
; CHECK-NEXT: [[FOO4:%.*]] = insertvalue [2 x double] undef, double [[FOO1]], 0
|
||||
; CHECK-NEXT: [[FOO5:%.*]] = insertvalue [2 x double] [[FOO4]], double [[FOO3]], 1
|
||||
; CHECK-NEXT: store [2 x double] [[FOO5]], [2 x double] addrspace(5)* [[S]], align 8
|
||||
; CHECK-NEXT: [[FOO6:%.*]] = getelementptr [2 x double], [2 x double] addrspace(5)* [[S]], i32 0, i32 1
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = bitcast [2 x double] addrspace(5)* [[S]] to <2 x double> addrspace(5)*
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double> addrspace(5)* [[TMP1]], align 16
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i64 1
|
||||
; CHECK-NEXT: [[FOO8:%.*]] = getelementptr [2 x double], [2 x double] addrspace(5)* [[S]], i32 0, i32 1
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = bitcast [2 x double] addrspace(5)* [[S]] to <2 x double> addrspace(5)*
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double> addrspace(5)* [[TMP4]], align 16
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 1
|
||||
; CHECK-NEXT: [[FOO10:%.*]] = fadd double [[TMP3]], [[TMP6]]
|
||||
; CHECK-NEXT: [[FOO11:%.*]] = getelementptr [2 x double], [2 x double] addrspace(5)* [[S]], i32 0, i32 0
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = bitcast [2 x double] addrspace(5)* [[S]] to <2 x double> addrspace(5)*
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double> addrspace(5)* [[TMP7]], align 16
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[FOO10]], i32 0
|
||||
; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double> addrspace(5)* [[TMP7]], align 16
|
||||
; CHECK-NEXT: [[FOO12:%.*]] = getelementptr [2 x double], [2 x double] addrspace(5)* [[S]], i32 0, i32 0
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = bitcast [2 x double] addrspace(5)* [[S]] to <2 x double> addrspace(5)*
|
||||
; CHECK-NEXT: [[TMP11:%.*]] = load <2 x double>, <2 x double> addrspace(5)* [[TMP10]], align 16
|
||||
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP11]], i32 0
|
||||
; CHECK-NEXT: [[FOO14:%.*]] = getelementptr [2 x double], [2 x double] addrspace(5)* [[S]], i32 0, i32 1
|
||||
; CHECK-NEXT: [[TMP13:%.*]] = bitcast [2 x double] addrspace(5)* [[S]] to <2 x double> addrspace(5)*
|
||||
; CHECK-NEXT: [[TMP14:%.*]] = load <2 x double>, <2 x double> addrspace(5)* [[TMP13]], align 16
|
||||
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[TMP14]], i64 1
|
||||
; CHECK-NEXT: [[FOO16:%.*]] = fadd double [[TMP12]], [[TMP15]]
|
||||
; CHECK-NEXT: store [2 x double] [[FOO5]], ptr addrspace(5) [[S]], align 8
|
||||
; CHECK-NEXT: [[FOO6:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 16
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i64 1
|
||||
; CHECK-NEXT: [[FOO8:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 16
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i64 1
|
||||
; CHECK-NEXT: [[FOO10:%.*]] = fadd double [[TMP2]], [[TMP4]]
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 16
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[FOO10]], i32 0
|
||||
; CHECK-NEXT: store <2 x double> [[TMP6]], ptr addrspace(5) [[S]], align 16
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 16
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
|
||||
; CHECK-NEXT: [[FOO14:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 16
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i64 1
|
||||
; CHECK-NEXT: [[FOO16:%.*]] = fadd double [[TMP8]], [[TMP10]]
|
||||
; CHECK-NEXT: [[FOO17:%.*]] = fptrunc double [[FOO16]] to float
|
||||
; CHECK-NEXT: [[FOO18:%.*]] = insertelement <4 x float> undef, float [[FOO17]], i32 0
|
||||
; CHECK-NEXT: [[FOO19:%.*]] = insertelement <4 x float> [[FOO18]], float [[FOO17]], i32 1
|
||||
; CHECK-NEXT: [[FOO20:%.*]] = insertelement <4 x float> [[FOO19]], float [[FOO17]], i32 2
|
||||
; CHECK-NEXT: [[FOO21:%.*]] = insertelement <4 x float> [[FOO20]], float [[FOO17]], i32 3
|
||||
; CHECK-NEXT: store <4 x float> [[FOO21]], <4 x float> addrspace(1)* @frag_color, align 16
|
||||
; CHECK-NEXT: store <4 x float> [[FOO21]], ptr addrspace(1) @frag_color, align 16
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%s = alloca [2 x double], addrspace(5)
|
||||
%foo = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(1)* @tmp_g, i32 0, i32 0, i32 0
|
||||
%foo1 = load double, double addrspace(1)* %foo
|
||||
%foo2 = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(1)* @tmp_g, i32 0, i32 0, i32 1
|
||||
%foo3 = load double, double addrspace(1)* %foo2
|
||||
%foo = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 0
|
||||
%foo1 = load double, ptr addrspace(1) %foo
|
||||
%foo2 = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 1
|
||||
%foo3 = load double, ptr addrspace(1) %foo2
|
||||
%foo4 = insertvalue [2 x double] undef, double %foo1, 0
|
||||
%foo5 = insertvalue [2 x double] %foo4, double %foo3, 1
|
||||
store [2 x double] %foo5, [2 x double] addrspace(5)* %s
|
||||
%foo6 = getelementptr [2 x double], [2 x double] addrspace(5)* %s, i32 0, i32 1
|
||||
%foo7 = load double, double addrspace(5)* %foo6
|
||||
%foo8 = getelementptr [2 x double], [2 x double] addrspace(5)* %s, i32 0, i32 1
|
||||
%foo9 = load double, double addrspace(5)* %foo8
|
||||
store [2 x double] %foo5, ptr addrspace(5) %s
|
||||
%foo6 = getelementptr [2 x double], ptr addrspace(5) %s, i32 0, i32 1
|
||||
%foo7 = load double, ptr addrspace(5) %foo6
|
||||
%foo8 = getelementptr [2 x double], ptr addrspace(5) %s, i32 0, i32 1
|
||||
%foo9 = load double, ptr addrspace(5) %foo8
|
||||
%foo10 = fadd double %foo7, %foo9
|
||||
%foo11 = getelementptr [2 x double], [2 x double] addrspace(5)* %s, i32 0, i32 0
|
||||
store double %foo10, double addrspace(5)* %foo11
|
||||
%foo12 = getelementptr [2 x double], [2 x double] addrspace(5)* %s, i32 0, i32 0
|
||||
%foo13 = load double, double addrspace(5)* %foo12
|
||||
%foo14 = getelementptr [2 x double], [2 x double] addrspace(5)* %s, i32 0, i32 1
|
||||
%foo15 = load double, double addrspace(5)* %foo14
|
||||
store double %foo10, ptr addrspace(5) %s
|
||||
%foo13 = load double, ptr addrspace(5) %s
|
||||
%foo14 = getelementptr [2 x double], ptr addrspace(5) %s, i32 0, i32 1
|
||||
%foo15 = load double, ptr addrspace(5) %foo14
|
||||
%foo16 = fadd double %foo13, %foo15
|
||||
%foo17 = fptrunc double %foo16 to float
|
||||
%foo18 = insertelement <4 x float> undef, float %foo17, i32 0
|
||||
%foo19 = insertelement <4 x float> %foo18, float %foo17, i32 1
|
||||
%foo20 = insertelement <4 x float> %foo19, float %foo17, i32 2
|
||||
%foo21 = insertelement <4 x float> %foo20, float %foo17, i32 3
|
||||
store <4 x float> %foo21, <4 x float> addrspace(1)* @frag_color
|
||||
store <4 x float> %foo21, ptr addrspace(1) @frag_color
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -234,22 +208,21 @@ define amdgpu_ps void @promote_double_aggr() #0 {
|
||||
define amdgpu_kernel void @alloca_struct() #0 {
|
||||
; CHECK-LABEL: @alloca_struct(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[TMP0]] to i32 addrspace(4)*
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(4)* [[TMP1]], i64 1
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[TMP2]], align 4, !invariant.load !0
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(4)* [[TMP1]], i64 2
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32 addrspace(4)* [[TMP4]], align 4, !range [[RNG1:![0-9]+]], !invariant.load !0
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP3]], 16
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG2:![0-9]+]]
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.workitem.id.y(), !range [[RNG2]]
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.workitem.id.z(), !range [[RNG2]]
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]]
|
||||
; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]]
|
||||
; CHECK-NEXT: [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]]
|
||||
; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]]
|
||||
; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]]
|
||||
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x [2 x %struct]], [1024 x [2 x %struct]] addrspace(3)* @alloca_struct.alloca, i32 0, i32 [[TMP14]]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load !0
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1:![0-9]+]], !invariant.load !0
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG2:![0-9]+]]
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.workitem.id.y(), !range [[RNG2]]
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.workitem.id.z(), !range [[RNG2]]
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
|
||||
; CHECK-NEXT: [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
|
||||
; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
|
||||
; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
|
||||
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x [2 x %struct]], ptr addrspace(3) @alloca_struct.alloca, i32 0, i32 [[TMP13]]
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
|
||||
|
||||
@ -5,45 +5,43 @@
|
||||
|
||||
; CHECK-LABEL: @array_alloca(
|
||||
; CHECK: %stack = alloca i32, i32 5, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @array_alloca(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
|
||||
define amdgpu_kernel void @array_alloca(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 {
|
||||
entry:
|
||||
%stack = alloca i32, i32 5, align 4, addrspace(5)
|
||||
%ld0 = load i32, i32 addrspace(1)* %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 %ld0
|
||||
store i32 4, i32 addrspace(5)* %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
|
||||
%ld1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 %ld1
|
||||
store i32 5, i32 addrspace(5)* %arrayidx3, align 4
|
||||
%arrayidx10 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 0
|
||||
%ld2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
|
||||
store i32 %ld2, i32 addrspace(1)* %out, align 4
|
||||
%arrayidx12 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 1
|
||||
%ld3 = load i32, i32 addrspace(5)* %arrayidx12
|
||||
%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
|
||||
store i32 %ld3, i32 addrspace(1)* %arrayidx13
|
||||
%ld0 = load i32, ptr addrspace(1) %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds i32, ptr addrspace(5) %stack, i32 %ld0
|
||||
store i32 4, ptr addrspace(5) %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
|
||||
%ld1 = load i32, ptr addrspace(1) %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds i32, ptr addrspace(5) %stack, i32 %ld1
|
||||
store i32 5, ptr addrspace(5) %arrayidx3, align 4
|
||||
%ld2 = load i32, ptr addrspace(5) %stack, align 4
|
||||
store i32 %ld2, ptr addrspace(1) %out, align 4
|
||||
%arrayidx12 = getelementptr inbounds i32, ptr addrspace(5) %stack, i32 1
|
||||
%ld3 = load i32, ptr addrspace(5) %arrayidx12
|
||||
%arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
|
||||
store i32 %ld3, ptr addrspace(1) %arrayidx13
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @array_alloca_dynamic(
|
||||
; CHECK: %stack = alloca i32, i32 %size, align 4, addrspace(5)
|
||||
define amdgpu_kernel void @array_alloca_dynamic(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %size) #0 {
|
||||
define amdgpu_kernel void @array_alloca_dynamic(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in, i32 %size) #0 {
|
||||
entry:
|
||||
%stack = alloca i32, i32 %size, align 4, addrspace(5)
|
||||
%ld0 = load i32, i32 addrspace(1)* %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 %ld0
|
||||
store i32 4, i32 addrspace(5)* %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
|
||||
%ld1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 %ld1
|
||||
store i32 5, i32 addrspace(5)* %arrayidx3, align 4
|
||||
%arrayidx10 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 0
|
||||
%ld2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
|
||||
store i32 %ld2, i32 addrspace(1)* %out, align 4
|
||||
%arrayidx12 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 1
|
||||
%ld3 = load i32, i32 addrspace(5)* %arrayidx12
|
||||
%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
|
||||
store i32 %ld3, i32 addrspace(1)* %arrayidx13
|
||||
%ld0 = load i32, ptr addrspace(1) %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds i32, ptr addrspace(5) %stack, i32 %ld0
|
||||
store i32 4, ptr addrspace(5) %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
|
||||
%ld1 = load i32, ptr addrspace(1) %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds i32, ptr addrspace(5) %stack, i32 %ld1
|
||||
store i32 5, ptr addrspace(5) %arrayidx3, align 4
|
||||
%ld2 = load i32, ptr addrspace(5) %stack, align 4
|
||||
store i32 %ld2, ptr addrspace(1) %out, align 4
|
||||
%arrayidx12 = getelementptr inbounds i32, ptr addrspace(5) %stack, i32 1
|
||||
%ld3 = load i32, ptr addrspace(5) %arrayidx12
|
||||
%arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
|
||||
store i32 %ld3, ptr addrspace(1) %arrayidx13
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
@ -1,28 +1,27 @@
|
||||
; RUN: opt -data-layout=A5 -S -mtriple=amdgcn-unknown-unknown -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=IR %s
|
||||
; RUN: llc -march=amdgcn -mcpu=fiji -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=ASM %s
|
||||
|
||||
; IR-LABEL: define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 {
|
||||
; IR-LABEL: define amdgpu_vs void @promote_alloca_shaders(ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %in) #0 {
|
||||
; IR: alloca [5 x i32]
|
||||
|
||||
; ASM-LABEL: {{^}}promote_alloca_shaders:
|
||||
; ASM: ; ScratchSize: 24
|
||||
define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 {
|
||||
define amdgpu_vs void @promote_alloca_shaders(ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %in) #0 {
|
||||
entry:
|
||||
%stack = alloca [5 x i32], align 4, addrspace(5)
|
||||
%tmp0 = load i32, i32 addrspace(1)* %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp0
|
||||
store i32 4, i32 addrspace(5)* %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
|
||||
%tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp1
|
||||
store i32 5, i32 addrspace(5)* %arrayidx3, align 4
|
||||
%arrayidx4 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
|
||||
%tmp2 = load i32, i32 addrspace(5)* %arrayidx4, align 4
|
||||
store i32 %tmp2, i32 addrspace(1)* %out, align 4
|
||||
%arrayidx5 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
|
||||
%tmp3 = load i32, i32 addrspace(5)* %arrayidx5
|
||||
%arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
|
||||
store i32 %tmp3, i32 addrspace(1)* %arrayidx6
|
||||
%tmp0 = load i32, ptr addrspace(1) %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp0
|
||||
store i32 4, ptr addrspace(5) %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
|
||||
%tmp1 = load i32, ptr addrspace(1) %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp1
|
||||
store i32 5, ptr addrspace(5) %arrayidx3, align 4
|
||||
%tmp2 = load i32, ptr addrspace(5) %stack, align 4
|
||||
store i32 %tmp2, ptr addrspace(1) %out, align 4
|
||||
%arrayidx5 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
|
||||
%tmp3 = load i32, ptr addrspace(5) %arrayidx5
|
||||
%arrayidx6 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
|
||||
store i32 %tmp3, ptr addrspace(1) %arrayidx6
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -33,18 +32,17 @@ entry:
|
||||
; ASM-LABEL: {{^}}promote_to_vector_call_c:
|
||||
; ASM-NOT: LDSByteSize
|
||||
; ASM: ; ScratchSize: 12
|
||||
define void @promote_to_vector_call_c(i32 addrspace(1)* %out, i32 %in) #0 {
|
||||
define void @promote_to_vector_call_c(ptr addrspace(1) %out, i32 %in) #0 {
|
||||
entry:
|
||||
%tmp = alloca [2 x i32], addrspace(5)
|
||||
%tmp1 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 0
|
||||
%tmp2 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 1
|
||||
store i32 0, i32 addrspace(5)* %tmp1
|
||||
store i32 1, i32 addrspace(5)* %tmp2
|
||||
%tmp3 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 %in
|
||||
%tmp4 = load i32, i32 addrspace(5)* %tmp3
|
||||
%tmp5 = load volatile i32, i32 addrspace(1)* undef
|
||||
%tmp2 = getelementptr [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
|
||||
store i32 0, ptr addrspace(5) %tmp
|
||||
store i32 1, ptr addrspace(5) %tmp2
|
||||
%tmp3 = getelementptr [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 %in
|
||||
%tmp4 = load i32, ptr addrspace(5) %tmp3
|
||||
%tmp5 = load volatile i32, ptr addrspace(1) undef
|
||||
%tmp6 = add i32 %tmp4, %tmp5
|
||||
store i32 %tmp6, i32 addrspace(1)* %out
|
||||
store i32 %tmp6, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -54,43 +52,41 @@ entry:
|
||||
; ASM-LABEL: {{^}}no_promote_to_lds_c:
|
||||
; ASM-NOT: LDSByteSize
|
||||
; ASM: ; ScratchSize: 24
|
||||
; Plain function (no amdgpu_kernel calling convention) with a [5 x i32] alloca
; in addrspace(5). It stores 4 and 5 at the indices loaded from %in[0] and
; %in[1], then copies elements 0 and 1 of the alloca to %out[0] and %out[1].
; The typed-pointer body comes first and is followed by the opaque-pointer
; (`ptr`) body.
define void @no_promote_to_lds_c(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
|
||||
define void @no_promote_to_lds_c(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 {
|
||||
entry:
|
||||
%stack = alloca [5 x i32], align 4, addrspace(5)
|
||||
%0 = load i32, i32 addrspace(1)* %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
|
||||
store i32 4, i32 addrspace(5)* %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
|
||||
%1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
|
||||
store i32 5, i32 addrspace(5)* %arrayidx3, align 4
|
||||
%arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
|
||||
%2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
|
||||
store i32 %2, i32 addrspace(1)* %out, align 4
|
||||
%arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
|
||||
%3 = load i32, i32 addrspace(5)* %arrayidx12
|
||||
%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
|
||||
store i32 %3, i32 addrspace(1)* %arrayidx13
|
||||
%0 = load i32, ptr addrspace(1) %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
|
||||
store i32 4, ptr addrspace(5) %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
|
||||
%1 = load i32, ptr addrspace(1) %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
|
||||
store i32 5, ptr addrspace(5) %arrayidx3, align 4
|
||||
; Element 0 is read straight from %stack here; the typed-pointer body used the
; zero-index GEP %arrayidx10 for the same load.
%2 = load i32, ptr addrspace(5) %stack, align 4
|
||||
store i32 %2, ptr addrspace(1) %out, align 4
|
||||
%arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
|
||||
%3 = load i32, ptr addrspace(5) %arrayidx12
|
||||
%arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
|
||||
store i32 %3, ptr addrspace(1) %arrayidx13
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @foo(i32 addrspace(5)*) #0
|
||||
declare i32 @foo(ptr addrspace(5)) #0
|
||||
|
||||
; ASM-LABEL: {{^}}call_private:
|
||||
; ASM: buffer_store_dword
|
||||
; ASM: buffer_store_dword
|
||||
; ASM: s_swappc_b64
|
||||
; ASM: ScratchSize: 16400
|
||||
; Kernel that fills a two-element i32 alloca in addrspace(5) with 0 and 1,
; passes a pointer to the element selected by %in to the external function @foo,
; and stores the value @foo returns to %out.
; The typed-pointer lines come first and are followed by their opaque-pointer
; (`ptr`) replacements.
define amdgpu_kernel void @call_private(i32 addrspace(1)* %out, i32 %in) #0 {
|
||||
define amdgpu_kernel void @call_private(ptr addrspace(1) %out, i32 %in) #0 {
|
||||
entry:
|
||||
%tmp = alloca [2 x i32], addrspace(5)
|
||||
%tmp1 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 0
|
||||
%tmp2 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 1
|
||||
store i32 0, i32 addrspace(5)* %tmp1
|
||||
store i32 1, i32 addrspace(5)* %tmp2
|
||||
%tmp3 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 %in
|
||||
%val = call i32 @foo(i32 addrspace(5)* %tmp3)
|
||||
store i32 %val, i32 addrspace(1)* %out
|
||||
%tmp2 = getelementptr [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
|
||||
; Element 0 is written through %tmp directly in the opaque-pointer form; the
; zero-index GEP %tmp1 is not repeated.
store i32 0, ptr addrspace(5) %tmp
|
||||
store i32 1, ptr addrspace(5) %tmp2
|
||||
%tmp3 = getelementptr [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 %in
|
||||
%val = call i32 @foo(ptr addrspace(5) %tmp3)
|
||||
store i32 %val, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
@ -1,23 +1,22 @@
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mtriple=amdgcn-- -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
||||
target datalayout = "A5"
|
||||
|
||||
declare {}* @llvm.invariant.start.p5i8(i64, i8 addrspace(5)* nocapture) #0
|
||||
declare void @llvm.invariant.end.p5i8({}*, i64, i8 addrspace(5)* nocapture) #0
|
||||
declare i8 addrspace(5)* @llvm.launder.invariant.group.p5i8(i8 addrspace(5)*) #1
|
||||
declare ptr @llvm.invariant.start.p5(i64, ptr addrspace(5) nocapture) #0
|
||||
declare void @llvm.invariant.end.p5(ptr, i64, ptr addrspace(5) nocapture) #0
|
||||
declare ptr addrspace(5) @llvm.launder.invariant.group.p5(ptr addrspace(5)) #1
|
||||
|
||||
; GCN-LABEL: {{^}}use_invariant_promotable_lds:
|
||||
; GCN: buffer_load_dword
|
||||
; GCN: ds_write_b32
|
||||
; Kernel that copies %arg[1] into an i32 alloca in addrspace(5) and then calls
; llvm.invariant.start, llvm.invariant.end and llvm.launder.invariant.group on
; that alloca.
; The typed-pointer lines go through the i8 bitcast %tmp1 and the .p5i8
; intrinsic suffix; the opaque-pointer lines that follow pass %tmp directly and
; use the .p5 suffix.
define amdgpu_kernel void @use_invariant_promotable_lds(i32 addrspace(1)* %arg) #2 {
|
||||
define amdgpu_kernel void @use_invariant_promotable_lds(ptr addrspace(1) %arg) #2 {
|
||||
bb:
|
||||
%tmp = alloca i32, align 4, addrspace(5)
|
||||
%tmp1 = bitcast i32 addrspace(5)* %tmp to i8 addrspace(5)*
|
||||
%tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
|
||||
%tmp3 = load i32, i32 addrspace(1)* %tmp2
|
||||
store i32 %tmp3, i32 addrspace(5)* %tmp
|
||||
%tmp4 = call {}* @llvm.invariant.start.p5i8(i64 4, i8 addrspace(5)* %tmp1) #0
|
||||
call void @llvm.invariant.end.p5i8({}* %tmp4, i64 4, i8 addrspace(5)* %tmp1) #0
|
||||
%tmp5 = call i8 addrspace(5)* @llvm.launder.invariant.group.p5i8(i8 addrspace(5)* %tmp1) #1
|
||||
%tmp2 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
|
||||
%tmp3 = load i32, ptr addrspace(1) %tmp2
|
||||
store i32 %tmp3, ptr addrspace(5) %tmp
|
||||
%tmp4 = call ptr @llvm.invariant.start.p5(i64 4, ptr addrspace(5) %tmp) #0
|
||||
call void @llvm.invariant.end.p5(ptr %tmp4, i64 4, ptr addrspace(5) %tmp) #0
|
||||
%tmp5 = call ptr addrspace(5) @llvm.launder.invariant.group.p5(ptr addrspace(5) %tmp) #1
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
@ -2,22 +2,21 @@
|
||||
|
||||
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
|
||||
|
||||
declare void @llvm.lifetime.start.p5i8(i64, i8 addrspace(5)* nocapture) #0
|
||||
declare void @llvm.lifetime.end.p5i8(i64, i8 addrspace(5)* nocapture) #0
|
||||
declare void @llvm.lifetime.start.p5(i64, ptr addrspace(5) nocapture) #0
|
||||
declare void @llvm.lifetime.end.p5(i64, ptr addrspace(5) nocapture) #0
|
||||
|
||||
; OPT-LABEL: @use_lifetime_promotable_lds(
|
||||
; OPT-NOT: alloca i32
|
||||
; OPT-NOT: llvm.lifetime
|
||||
; OPT: store i32 %tmp3, i32 addrspace(3)*
|
||||
; Kernel that brackets a copy of %arg[1] into an i32 addrspace(5) alloca with
; llvm.lifetime.start / llvm.lifetime.end calls of size 4 on that alloca.
; The typed-pointer lines pass the i8 bitcast %tmp1 to the .p5i8 intrinsics; the
; opaque-pointer lines pass %tmp directly to the .p5 intrinsics.
define amdgpu_kernel void @use_lifetime_promotable_lds(i32 addrspace(1)* %arg) #2 {
|
||||
; OPT: store i32 %tmp3, ptr addrspace(3)
|
||||
define amdgpu_kernel void @use_lifetime_promotable_lds(ptr addrspace(1) %arg) #2 {
|
||||
bb:
|
||||
%tmp = alloca i32, align 4, addrspace(5)
|
||||
%tmp1 = bitcast i32 addrspace(5)* %tmp to i8 addrspace(5)*
|
||||
call void @llvm.lifetime.start.p5i8(i64 4, i8 addrspace(5)* %tmp1)
|
||||
%tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
|
||||
%tmp3 = load i32, i32 addrspace(1)* %tmp2
|
||||
store i32 %tmp3, i32 addrspace(5)* %tmp
|
||||
call void @llvm.lifetime.end.p5i8(i64 4, i8 addrspace(5)* %tmp1)
|
||||
call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %tmp)
|
||||
%tmp2 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
|
||||
%tmp3 = load i32, ptr addrspace(1) %tmp2
|
||||
store i32 %tmp3, ptr addrspace(5) %tmp
|
||||
call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %tmp)
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -29,7 +28,7 @@ bb:
|
||||
; Kernel whose i8 addrspace(5) alloca is used only by a single
; llvm.lifetime.start call of size 1; there is no matching lifetime.end before
; the return. The typed-pointer call (.p5i8) is followed by its opaque-pointer
; replacement (.p5).
define amdgpu_kernel void @iterator_erased_lifetime() {
|
||||
entry:
|
||||
%alloca = alloca i8, align 1, addrspace(5)
|
||||
call void @llvm.lifetime.start.p5i8(i64 1, i8 addrspace(5)* %alloca)
|
||||
call void @llvm.lifetime.start.p5(i64 1, ptr addrspace(5) %alloca)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
@ -1,95 +1,77 @@
|
||||
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -passes=amdgpu-promote-alloca < %s | FileCheck %s
|
||||
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -passes=amdgpu-promote-alloca < %s | FileCheck --enable-var-scope %s
|
||||
|
||||
declare void @llvm.memcpy.p5i8.p1i8.i32(i8 addrspace(5)* nocapture, i8 addrspace(1)* nocapture, i32, i1) #0
|
||||
declare void @llvm.memcpy.p1i8.p5i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(5)* nocapture, i32, i1) #0
|
||||
declare void @llvm.memcpy.p5i8.p5i8.i64(i8 addrspace(5)* nocapture, i8 addrspace(5)* nocapture, i64, i1) #0
|
||||
declare void @llvm.memcpy.p5.p1.i32(ptr addrspace(5) nocapture, ptr addrspace(1) nocapture, i32, i1) #0
|
||||
declare void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) nocapture, ptr addrspace(5) nocapture, i32, i1) #0
|
||||
declare void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) nocapture, ptr addrspace(5) nocapture, i64, i1) #0
|
||||
|
||||
declare void @llvm.memmove.p5i8.p1i8.i32(i8 addrspace(5)* nocapture, i8 addrspace(1)* nocapture, i32, i1) #0
|
||||
declare void @llvm.memmove.p1i8.p5i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(5)* nocapture, i32, i1) #0
|
||||
declare void @llvm.memmove.p5i8.p5i8.i64(i8 addrspace(5)* nocapture, i8 addrspace(5)* nocapture, i64, i1) #0
|
||||
declare void @llvm.memmove.p5.p1.i32(ptr addrspace(5) nocapture, ptr addrspace(1) nocapture, i32, i1) #0
|
||||
declare void @llvm.memmove.p1.p5.i32(ptr addrspace(1) nocapture, ptr addrspace(5) nocapture, i32, i1) #0
|
||||
declare void @llvm.memmove.p5.p5.i64(ptr addrspace(5) nocapture, ptr addrspace(5) nocapture, i64, i1) #0
|
||||
|
||||
declare void @llvm.memset.p5i8.i32(i8 addrspace(5)* nocapture, i8, i32, i1) #0
|
||||
declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture, i8, i32, i1) #0
|
||||
|
||||
declare i32 @llvm.objectsize.i32.p5i8(i8 addrspace(5)*, i1, i1, i1) #1
|
||||
declare i32 @llvm.objectsize.i32.p5(ptr addrspace(5), i1, i1, i1) #1
|
||||
|
||||
; CHECK-LABEL: @promote_with_memcpy(
|
||||
; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memcpy.alloca, i32 0, i32 %{{[0-9]+}}
|
||||
; CHECK: call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %alloca.bc, i8 addrspace(1)* align 4 %in.bc, i32 68, i1 false)
|
||||
; CHECK: call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out.bc, i8 addrspace(3)* align 4 %alloca.bc, i32 68, i1 false)
|
||||
; Kernel that memcpy's 68 bytes from %in into a [17 x i32] addrspace(5) alloca
; and then 68 bytes from that alloca to %out (both calls use align 4 and a
; false volatile flag).
; The typed-pointer lines route every pointer through an i8 bitcast (*.bc); the
; opaque-pointer lines pass %alloca, %in and %out to the intrinsic directly.
define amdgpu_kernel void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
|
||||
; CHECK: [[GEP:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], ptr addrspace(3) @promote_with_memcpy.alloca, i32 0, i32 %{{[0-9]+}}
|
||||
; CHECK: call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 4 [[GEP]], ptr addrspace(1) align 4 %in, i32 68, i1 false)
|
||||
; CHECK: call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 %out, ptr addrspace(3) align 4 [[GEP]], i32 68, i1 false)
|
||||
define amdgpu_kernel void @promote_with_memcpy(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
||||
%alloca = alloca [17 x i32], align 4, addrspace(5)
|
||||
%alloca.bc = bitcast [17 x i32] addrspace(5)* %alloca to i8 addrspace(5)*
|
||||
%in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
|
||||
%out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)*
|
||||
call void @llvm.memcpy.p5i8.p1i8.i32(i8 addrspace(5)* align 4 %alloca.bc, i8 addrspace(1)* align 4 %in.bc, i32 68, i1 false)
|
||||
call void @llvm.memcpy.p1i8.p5i8.i32(i8 addrspace(1)* align 4 %out.bc, i8 addrspace(5)* align 4 %alloca.bc, i32 68, i1 false)
|
||||
call void @llvm.memcpy.p5.p1.i32(ptr addrspace(5) align 4 %alloca, ptr addrspace(1) align 4 %in, i32 68, i1 false)
|
||||
call void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) align 4 %out, ptr addrspace(5) align 4 %alloca, i32 68, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @promote_with_memmove(
|
||||
; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memmove.alloca, i32 0, i32 %{{[0-9]+}}
|
||||
; CHECK: call void @llvm.memmove.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %alloca.bc, i8 addrspace(1)* align 4 %in.bc, i32 68, i1 false)
|
||||
; CHECK: call void @llvm.memmove.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out.bc, i8 addrspace(3)* align 4 %alloca.bc, i32 68, i1 false)
|
||||
; Kernel that memmove's 68 bytes from %in into a [17 x i32] addrspace(5) alloca
; and then 68 bytes from that alloca to %out (both calls use align 4 and a
; false volatile flag).
; The typed-pointer lines route every pointer through an i8 bitcast (*.bc); the
; opaque-pointer lines pass %alloca, %in and %out to the intrinsic directly.
define amdgpu_kernel void @promote_with_memmove(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
|
||||
; CHECK: [[GEP:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], ptr addrspace(3) @promote_with_memmove.alloca, i32 0, i32 %{{[0-9]+}}
|
||||
; CHECK: call void @llvm.memmove.p3.p1.i32(ptr addrspace(3) align 4 [[GEP]], ptr addrspace(1) align 4 %in, i32 68, i1 false)
|
||||
; CHECK: call void @llvm.memmove.p1.p3.i32(ptr addrspace(1) align 4 %out, ptr addrspace(3) align 4 [[GEP]], i32 68, i1 false)
|
||||
define amdgpu_kernel void @promote_with_memmove(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
||||
%alloca = alloca [17 x i32], align 4, addrspace(5)
|
||||
%alloca.bc = bitcast [17 x i32] addrspace(5)* %alloca to i8 addrspace(5)*
|
||||
%in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
|
||||
%out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)*
|
||||
call void @llvm.memmove.p5i8.p1i8.i32(i8 addrspace(5)* align 4 %alloca.bc, i8 addrspace(1)* align 4 %in.bc, i32 68, i1 false)
|
||||
call void @llvm.memmove.p1i8.p5i8.i32(i8 addrspace(1)* align 4 %out.bc, i8 addrspace(5)* align 4 %alloca.bc, i32 68, i1 false)
|
||||
call void @llvm.memmove.p5.p1.i32(ptr addrspace(5) align 4 %alloca, ptr addrspace(1) align 4 %in, i32 68, i1 false)
|
||||
call void @llvm.memmove.p1.p5.i32(ptr addrspace(1) align 4 %out, ptr addrspace(5) align 4 %alloca, i32 68, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @promote_with_memset(
|
||||
; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memset.alloca, i32 0, i32 %{{[0-9]+}}
|
||||
; CHECK: call void @llvm.memset.p3i8.i32(i8 addrspace(3)* align 4 %alloca.bc, i8 7, i32 68, i1 false)
|
||||
; Kernel that memsets 68 bytes of a [17 x i32] addrspace(5) alloca to the byte
; value 7 (align 4, volatile flag false). Neither %out nor %in is passed to the
; memset call; the typed-pointer lines only bitcast them.
; The typed-pointer lines are followed by the single opaque-pointer call that
; replaces them.
define amdgpu_kernel void @promote_with_memset(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
|
||||
; CHECK: [[GEP:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], ptr addrspace(3) @promote_with_memset.alloca, i32 0, i32 %{{[0-9]+}}
|
||||
; CHECK: call void @llvm.memset.p3.i32(ptr addrspace(3) align 4 [[GEP]], i8 7, i32 68, i1 false)
|
||||
define amdgpu_kernel void @promote_with_memset(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
||||
%alloca = alloca [17 x i32], align 4, addrspace(5)
|
||||
%alloca.bc = bitcast [17 x i32] addrspace(5)* %alloca to i8 addrspace(5)*
|
||||
%in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
|
||||
%out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)*
|
||||
call void @llvm.memset.p5i8.i32(i8 addrspace(5)* align 4 %alloca.bc, i8 7, i32 68, i1 false)
|
||||
call void @llvm.memset.p5.i32(ptr addrspace(5) align 4 %alloca, i8 7, i32 68, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @promote_with_objectsize(
|
||||
; CHECK: [[PTR:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_objectsize.alloca, i32 0, i32 %{{[0-9]+}}
|
||||
; CHECK: call i32 @llvm.objectsize.i32.p3i8(i8 addrspace(3)* %alloca.bc, i1 false, i1 false, i1 false)
|
||||
; Kernel that calls llvm.objectsize.i32 on a [17 x i32] addrspace(5) alloca with
; all three i1 flags set to false and stores the returned size to %out.
; The typed-pointer lines query the i8 bitcast %alloca.bc via the .p5i8
; intrinsic; the opaque-pointer lines query %alloca directly via .p5.
define amdgpu_kernel void @promote_with_objectsize(i32 addrspace(1)* %out) #0 {
|
||||
; CHECK: [[PTR:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], ptr addrspace(3) @promote_with_objectsize.alloca, i32 0, i32 %{{[0-9]+}}
|
||||
; CHECK: call i32 @llvm.objectsize.i32.p3(ptr addrspace(3) [[PTR]], i1 false, i1 false, i1 false)
|
||||
define amdgpu_kernel void @promote_with_objectsize(ptr addrspace(1) %out) #0 {
|
||||
%alloca = alloca [17 x i32], align 4, addrspace(5)
|
||||
%alloca.bc = bitcast [17 x i32] addrspace(5)* %alloca to i8 addrspace(5)*
|
||||
%size = call i32 @llvm.objectsize.i32.p5i8(i8 addrspace(5)* %alloca.bc, i1 false, i1 false, i1 false)
|
||||
store i32 %size, i32 addrspace(1)* %out
|
||||
%size = call i32 @llvm.objectsize.i32.p5(ptr addrspace(5) %alloca, i1 false, i1 false, i1 false)
|
||||
store i32 %size, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @promote_alloca_used_twice_in_memcpy(
|
||||
; CHECK: %i = bitcast double addrspace(3)* %arrayidx1 to i8 addrspace(3)*
|
||||
; CHECK: %i1 = bitcast double addrspace(3)* %arrayidx2 to i8 addrspace(3)*
|
||||
; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* align 8 dereferenceable(16) %i, i8 addrspace(3)* align 8 dereferenceable(16) %i1, i64 16, i1 false)
|
||||
; CHECK: call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) align 8 dereferenceable(16) %arrayidx1, ptr addrspace(3) align 8 dereferenceable(16) %arrayidx2, i64 16, i1 false)
|
||||
; Kernel in which both operands of a 16-byte memcpy are derived from the same
; double alloca %r in addrspace(5): the destination is the GEP at index 1 and
; the source is the GEP at index %c.
; The typed-pointer lines add i8 bitcasts (%i, %i1) before the .p5i8.p5i8 call;
; the opaque-pointer lines pass the two GEPs straight to the .p5.p5 call.
define amdgpu_kernel void @promote_alloca_used_twice_in_memcpy(i32 %c) {
|
||||
entry:
|
||||
%r = alloca double, align 8, addrspace(5)
|
||||
%arrayidx1 = getelementptr inbounds double, double addrspace(5)* %r, i32 1
|
||||
%i = bitcast double addrspace(5)* %arrayidx1 to i8 addrspace(5)*
|
||||
%arrayidx2 = getelementptr inbounds double, double addrspace(5)* %r, i32 %c
|
||||
%i1 = bitcast double addrspace(5)* %arrayidx2 to i8 addrspace(5)*
|
||||
call void @llvm.memcpy.p5i8.p5i8.i64(i8 addrspace(5)* align 8 dereferenceable(16) %i, i8 addrspace(5)* align 8 dereferenceable(16) %i1, i64 16, i1 false)
|
||||
%arrayidx1 = getelementptr inbounds double, ptr addrspace(5) %r, i32 1
|
||||
%arrayidx2 = getelementptr inbounds double, ptr addrspace(5) %r, i32 %c
|
||||
call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 dereferenceable(16) %arrayidx1, ptr addrspace(5) align 8 dereferenceable(16) %arrayidx2, i64 16, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @promote_alloca_used_twice_in_memmove(
|
||||
; CHECK: %i = bitcast double addrspace(3)* %arrayidx1 to i8 addrspace(3)*
|
||||
; CHECK: %i1 = bitcast double addrspace(3)* %arrayidx2 to i8 addrspace(3)*
|
||||
; CHECK: call void @llvm.memmove.p3i8.p3i8.i64(i8 addrspace(3)* align 8 dereferenceable(16) %i, i8 addrspace(3)* align 8 dereferenceable(16) %i1, i64 16, i1 false)
|
||||
; CHECK: call void @llvm.memmove.p3.p3.i64(ptr addrspace(3) align 8 dereferenceable(16) %arrayidx1, ptr addrspace(3) align 8 dereferenceable(16) %arrayidx2, i64 16, i1 false)
|
||||
; Kernel in which both operands of a 16-byte memmove are derived from the same
; double alloca %r in addrspace(5): the destination is the GEP at index 1 and
; the source is the GEP at index %c.
; The typed-pointer lines add i8 bitcasts (%i, %i1) before the .p5i8.p5i8 call;
; the opaque-pointer lines pass the two GEPs straight to the .p5.p5 call.
define amdgpu_kernel void @promote_alloca_used_twice_in_memmove(i32 %c) {
|
||||
entry:
|
||||
%r = alloca double, align 8, addrspace(5)
|
||||
%arrayidx1 = getelementptr inbounds double, double addrspace(5)* %r, i32 1
|
||||
%i = bitcast double addrspace(5)* %arrayidx1 to i8 addrspace(5)*
|
||||
%arrayidx2 = getelementptr inbounds double, double addrspace(5)* %r, i32 %c
|
||||
%i1 = bitcast double addrspace(5)* %arrayidx2 to i8 addrspace(5)*
|
||||
call void @llvm.memmove.p5i8.p5i8.i64(i8 addrspace(5)* align 8 dereferenceable(16) %i, i8 addrspace(5)* align 8 dereferenceable(16) %i1, i64 16, i1 false)
|
||||
%arrayidx1 = getelementptr inbounds double, ptr addrspace(5) %r, i32 1
|
||||
%arrayidx2 = getelementptr inbounds double, ptr addrspace(5) %r, i32 %c
|
||||
call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) align 8 dereferenceable(16) %arrayidx1, ptr addrspace(5) align 8 dereferenceable(16) %arrayidx2, i64 16, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
@ -5,32 +5,30 @@
|
||||
; NOOPTS: workgroup_group_segment_byte_size = 0{{$}}
|
||||
; NOOPTS-NOT: ds_write
|
||||
; OPTS: ds_write
|
||||
; Kernel over a nested [2 x [2 x i32]] alloca in addrspace(5): it stores 0 and 1
; into [0][0] and [0][1], loads [0][%index], and writes the loaded value to
; %out.
; The typed-pointer lines come first and are followed by their opaque-pointer
; (`ptr`) replacements.
define amdgpu_kernel void @promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
|
||||
define amdgpu_kernel void @promote_alloca_i32_array_array(ptr addrspace(1) %out, i32 %index) #0 {
|
||||
entry:
|
||||
%alloca = alloca [2 x [2 x i32]], addrspace(5)
|
||||
%gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0
|
||||
%gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1
|
||||
store i32 0, i32 addrspace(5)* %gep0
|
||||
store i32 1, i32 addrspace(5)* %gep1
|
||||
%gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index
|
||||
%load = load i32, i32 addrspace(5)* %gep2
|
||||
store i32 %load, i32 addrspace(1)* %out
|
||||
%gep1 = getelementptr inbounds [2 x [2 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
|
||||
; Element [0][0] is written through %alloca directly in the opaque-pointer form;
; the all-zero GEP %gep0 is not repeated.
store i32 0, ptr addrspace(5) %alloca
|
||||
store i32 1, ptr addrspace(5) %gep1
|
||||
%gep2 = getelementptr inbounds [2 x [2 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 %index
|
||||
%load = load i32, ptr addrspace(5) %gep2
|
||||
store i32 %load, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: {{^}}optnone_promote_alloca_i32_array_array:
|
||||
; ALL: workgroup_group_segment_byte_size = 0{{$}}
|
||||
; ALL-NOT: ds_write
|
||||
; Same body as @promote_alloca_i32_array_array (stores 0 and 1 into [0][0] and
; [0][1] of a [2 x [2 x i32]] addrspace(5) alloca, loads [0][%index], writes it
; to %out), but attached to attribute group #1 instead of #0.
; NOTE(review): #1 presumably carries optnone, going by the function name; the
; attribute definition is outside this excerpt, so confirm there.
; The typed-pointer lines come first and are followed by their opaque-pointer
; (`ptr`) replacements.
define amdgpu_kernel void @optnone_promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #1 {
|
||||
define amdgpu_kernel void @optnone_promote_alloca_i32_array_array(ptr addrspace(1) %out, i32 %index) #1 {
|
||||
entry:
|
||||
%alloca = alloca [2 x [2 x i32]], addrspace(5)
|
||||
%gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0
|
||||
%gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1
|
||||
store i32 0, i32 addrspace(5)* %gep0
|
||||
store i32 1, i32 addrspace(5)* %gep1
|
||||
%gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index
|
||||
%load = load i32, i32 addrspace(5)* %gep2
|
||||
store i32 %load, i32 addrspace(1)* %out
|
||||
%gep1 = getelementptr inbounds [2 x [2 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
|
||||
store i32 0, ptr addrspace(5) %alloca
|
||||
store i32 1, ptr addrspace(5) %gep1
|
||||
%gep2 = getelementptr inbounds [2 x [2 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 %index
|
||||
%load = load i32, ptr addrspace(5) %gep2
|
||||
store i32 %load, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
@ -32,64 +32,62 @@
|
||||
|
||||
; GCN-LABEL: {{^}}promote_alloca_size_order_0:
|
||||
; GCN: workgroup_group_segment_byte_size = 1060
|
||||
; Kernel with a [5 x i32] addrspace(5) alloca: it stores 4 and 5 at the indices
; loaded from %in[0] and %in[1], then copies elements 0 and 1 of the alloca to
; %out[0] and %out[1]. After that it issues volatile zero stores to the
; addrspace(3) globals @lds1, @lds2 and @lds0, in that order, each indexed by
; %idx.
; The typed-pointer lines come first and are followed by their opaque-pointer
; (`ptr`) replacements.
define amdgpu_kernel void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
|
||||
define amdgpu_kernel void @promote_alloca_size_order_0(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in, i32 %idx) #0 {
|
||||
entry:
|
||||
%stack = alloca [5 x i32], align 4, addrspace(5)
|
||||
%tmp0 = load i32, i32 addrspace(1)* %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp0
|
||||
store i32 4, i32 addrspace(5)* %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
|
||||
%tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp1
|
||||
store i32 5, i32 addrspace(5)* %arrayidx3, align 4
|
||||
%arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
|
||||
%tmp2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
|
||||
store i32 %tmp2, i32 addrspace(1)* %out, align 4
|
||||
%arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
|
||||
%tmp3 = load i32, i32 addrspace(5)* %arrayidx12
|
||||
%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
|
||||
store i32 %tmp3, i32 addrspace(1)* %arrayidx13
|
||||
%tmp0 = load i32, ptr addrspace(1) %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp0
|
||||
store i32 4, ptr addrspace(5) %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
|
||||
%tmp1 = load i32, ptr addrspace(1) %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp1
|
||||
store i32 5, ptr addrspace(5) %arrayidx3, align 4
|
||||
%tmp2 = load i32, ptr addrspace(5) %stack, align 4
|
||||
store i32 %tmp2, ptr addrspace(1) %out, align 4
|
||||
%arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
|
||||
%tmp3 = load i32, ptr addrspace(5) %arrayidx12
|
||||
%arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
|
||||
store i32 %tmp3, ptr addrspace(1) %arrayidx13
|
||||
|
||||
%gep.lds1 = getelementptr inbounds [73 x i32], [73 x i32] addrspace(3)* @lds1, i32 0, i32 %idx
|
||||
store volatile i32 0, i32 addrspace(3)* %gep.lds1, align 4
|
||||
%gep.lds1 = getelementptr inbounds [73 x i32], ptr addrspace(3) @lds1, i32 0, i32 %idx
|
||||
store volatile i32 0, ptr addrspace(3) %gep.lds1, align 4
|
||||
|
||||
%gep.lds2 = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @lds2, i32 0, i32 %idx
|
||||
store volatile i64 0, i64 addrspace(3)* %gep.lds2, align 8
|
||||
%gep.lds2 = getelementptr inbounds [32 x i64], ptr addrspace(3) @lds2, i32 0, i32 %idx
|
||||
store volatile i64 0, ptr addrspace(3) %gep.lds2, align 8
|
||||
|
||||
%gep.lds0 = getelementptr inbounds [32 x <4 x i32>], [32 x <4 x i32>] addrspace(3)* @lds0, i32 0, i32 %idx
|
||||
store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds0, align 16
|
||||
%gep.lds0 = getelementptr inbounds [32 x <4 x i32>], ptr addrspace(3) @lds0, i32 0, i32 %idx
|
||||
store volatile <4 x i32> zeroinitializer, ptr addrspace(3) %gep.lds0, align 16
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}promote_alloca_size_order_1:
|
||||
; GCN: workgroup_group_segment_byte_size = 1072
|
||||
; Kernel with a [5 x i32] addrspace(5) alloca: it stores 4 and 5 at the indices
; loaded from %in[0] and %in[1], then copies elements 0 and 1 of the alloca to
; %out[0] and %out[1]. After that it issues volatile zero stores to the
; addrspace(3) globals @lds0, @lds2 and @lds1, in that order, each indexed by
; %idx (the reverse of the order used in @promote_alloca_size_order_0).
; The typed-pointer lines come first and are followed by their opaque-pointer
; (`ptr`) replacements.
define amdgpu_kernel void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
|
||||
define amdgpu_kernel void @promote_alloca_size_order_1(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in, i32 %idx) #0 {
|
||||
entry:
|
||||
%stack = alloca [5 x i32], align 4, addrspace(5)
|
||||
%tmp0 = load i32, i32 addrspace(1)* %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp0
|
||||
store i32 4, i32 addrspace(5)* %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
|
||||
%tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp1
|
||||
store i32 5, i32 addrspace(5)* %arrayidx3, align 4
|
||||
%arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
|
||||
%tmp2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
|
||||
store i32 %tmp2, i32 addrspace(1)* %out, align 4
|
||||
%arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
|
||||
%tmp3 = load i32, i32 addrspace(5)* %arrayidx12
|
||||
%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
|
||||
store i32 %tmp3, i32 addrspace(1)* %arrayidx13
|
||||
%tmp0 = load i32, ptr addrspace(1) %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp0
|
||||
store i32 4, ptr addrspace(5) %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
|
||||
%tmp1 = load i32, ptr addrspace(1) %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp1
|
||||
store i32 5, ptr addrspace(5) %arrayidx3, align 4
|
||||
%tmp2 = load i32, ptr addrspace(5) %stack, align 4
|
||||
store i32 %tmp2, ptr addrspace(1) %out, align 4
|
||||
%arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
|
||||
%tmp3 = load i32, ptr addrspace(5) %arrayidx12
|
||||
%arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
|
||||
store i32 %tmp3, ptr addrspace(1) %arrayidx13
|
||||
|
||||
%gep.lds0 = getelementptr inbounds [32 x <4 x i32>], [32 x <4 x i32>] addrspace(3)* @lds0, i32 0, i32 %idx
|
||||
store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds0, align 16
|
||||
%gep.lds0 = getelementptr inbounds [32 x <4 x i32>], ptr addrspace(3) @lds0, i32 0, i32 %idx
|
||||
store volatile <4 x i32> zeroinitializer, ptr addrspace(3) %gep.lds0, align 16
|
||||
|
||||
%gep.lds2 = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @lds2, i32 0, i32 %idx
|
||||
store volatile i64 0, i64 addrspace(3)* %gep.lds2, align 8
|
||||
%gep.lds2 = getelementptr inbounds [32 x i64], ptr addrspace(3) @lds2, i32 0, i32 %idx
|
||||
store volatile i64 0, ptr addrspace(3) %gep.lds2, align 8
|
||||
|
||||
%gep.lds1 = getelementptr inbounds [73 x i32], [73 x i32] addrspace(3)* @lds1, i32 0, i32 %idx
|
||||
store volatile i32 0, i32 addrspace(3)* %gep.lds1, align 4
|
||||
%gep.lds1 = getelementptr inbounds [73 x i32], ptr addrspace(3) @lds1, i32 0, i32 %idx
|
||||
store volatile i32 0, ptr addrspace(3) %gep.lds1, align 4
|
||||
|
||||
ret void
|
||||
}
|
||||
@ -102,29 +100,28 @@ entry:
|
||||
|
||||
; GCN-LABEL: {{^}}promote_alloca_align_pad_guess_over_limit:
|
||||
; GCN: workgroup_group_segment_byte_size = 1060
|
||||
; Kernel with a [5 x i32] addrspace(5) alloca: it stores 4 and 5 at the indices
; loaded from %in[0] and %in[1], then copies elements 0 and 1 of the alloca to
; %out[0] and %out[1]. After that it issues volatile zero stores to the
; addrspace(3) globals @lds3 (i32 elements, align 4) and @lds4 (<4 x i32>
; elements, align 16), each indexed by %idx.
; The typed-pointer lines come first and are followed by their opaque-pointer
; (`ptr`) replacements.
define amdgpu_kernel void @promote_alloca_align_pad_guess_over_limit(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
|
||||
define amdgpu_kernel void @promote_alloca_align_pad_guess_over_limit(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in, i32 %idx) #0 {
|
||||
entry:
|
||||
%stack = alloca [5 x i32], align 4, addrspace(5)
|
||||
%tmp0 = load i32, i32 addrspace(1)* %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp0
|
||||
store i32 4, i32 addrspace(5)* %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
|
||||
%tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp1
|
||||
store i32 5, i32 addrspace(5)* %arrayidx3, align 4
|
||||
%arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
|
||||
%tmp2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
|
||||
store i32 %tmp2, i32 addrspace(1)* %out, align 4
|
||||
%arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
|
||||
%tmp3 = load i32, i32 addrspace(5)* %arrayidx12
|
||||
%arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
|
||||
store i32 %tmp3, i32 addrspace(1)* %arrayidx13
|
||||
%tmp0 = load i32, ptr addrspace(1) %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp0
|
||||
store i32 4, ptr addrspace(5) %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
|
||||
%tmp1 = load i32, ptr addrspace(1) %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp1
|
||||
store i32 5, ptr addrspace(5) %arrayidx3, align 4
|
||||
%tmp2 = load i32, ptr addrspace(5) %stack, align 4
|
||||
store i32 %tmp2, ptr addrspace(1) %out, align 4
|
||||
%arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
|
||||
%tmp3 = load i32, ptr addrspace(5) %arrayidx12
|
||||
%arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
|
||||
store i32 %tmp3, ptr addrspace(1) %arrayidx13
|
||||
|
||||
%gep.lds3 = getelementptr inbounds [13 x i32], [13 x i32] addrspace(3)* @lds3, i32 0, i32 %idx
|
||||
store volatile i32 0, i32 addrspace(3)* %gep.lds3, align 4
|
||||
%gep.lds3 = getelementptr inbounds [13 x i32], ptr addrspace(3) @lds3, i32 0, i32 %idx
|
||||
store volatile i32 0, ptr addrspace(3) %gep.lds3, align 4
|
||||
|
||||
%gep.lds4 = getelementptr inbounds [63 x <4 x i32>], [63 x <4 x i32>] addrspace(3)* @lds4, i32 0, i32 %idx
|
||||
store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds4, align 16
|
||||
%gep.lds4 = getelementptr inbounds [63 x <4 x i32>], ptr addrspace(3) @lds4, i32 0, i32 %idx
|
||||
store volatile <4 x i32> zeroinitializer, ptr addrspace(3) %gep.lds4, align 16
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -4,25 +4,19 @@
|
||||
define i64 @test_pointer_array(i64 %v) {
|
||||
; OPT-LABEL: @test_pointer_array(
|
||||
; OPT-NEXT: entry:
|
||||
; OPT-NEXT: [[A:%.*]] = alloca [3 x i8*], align 16, addrspace(5)
|
||||
; OPT-NEXT: [[GEP:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*] addrspace(5)* [[A]], i32 0, i32 0
|
||||
; OPT-NEXT: [[CAST:%.*]] = bitcast i8* addrspace(5)* [[GEP]] to i64 addrspace(5)*
|
||||
; OPT-NEXT: [[TMP0:%.*]] = bitcast [3 x i8*] addrspace(5)* [[A]] to <3 x i8*> addrspace(5)*
|
||||
; OPT-NEXT: [[TMP1:%.*]] = load <3 x i8*>, <3 x i8*> addrspace(5)* [[TMP0]], align 32
|
||||
; OPT-NEXT: [[TMP2:%.*]] = inttoptr i64 [[V:%.*]] to i8*
|
||||
; OPT-NEXT: [[TMP3:%.*]] = insertelement <3 x i8*> [[TMP1]], i8* [[TMP2]], i32 0
|
||||
; OPT-NEXT: store <3 x i8*> [[TMP3]], <3 x i8*> addrspace(5)* [[TMP0]], align 32
|
||||
; OPT-NEXT: [[TMP4:%.*]] = bitcast [3 x i8*] addrspace(5)* [[A]] to <3 x i8*> addrspace(5)*
|
||||
; OPT-NEXT: [[TMP5:%.*]] = load <3 x i8*>, <3 x i8*> addrspace(5)* [[TMP4]], align 32
|
||||
; OPT-NEXT: [[TMP6:%.*]] = extractelement <3 x i8*> [[TMP5]], i32 0
|
||||
; OPT-NEXT: [[TMP7:%.*]] = ptrtoint i8* [[TMP6]] to i64
|
||||
; OPT-NEXT: [[A:%.*]] = alloca [3 x ptr], align 16, addrspace(5)
|
||||
; OPT-NEXT: [[TMP1:%.*]] = load <3 x ptr>, ptr addrspace(5) [[A]], align 32
|
||||
; OPT-NEXT: [[TMP2:%.*]] = inttoptr i64 [[V:%.*]] to ptr
|
||||
; OPT-NEXT: [[TMP3:%.*]] = insertelement <3 x ptr> [[TMP1]], ptr [[TMP2]], i32 0
|
||||
; OPT-NEXT: store <3 x ptr> [[TMP3]], ptr addrspace(5) [[A]], align 32
|
||||
; OPT-NEXT: [[TMP5:%.*]] = load <3 x ptr>, ptr addrspace(5) [[A]], align 32
|
||||
; OPT-NEXT: [[TMP6:%.*]] = extractelement <3 x ptr> [[TMP5]], i32 0
|
||||
; OPT-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP6]] to i64
|
||||
; OPT-NEXT: ret i64 [[TMP7]]
|
||||
;
|
||||
entry:
|
||||
%a = alloca [3 x i8*], align 16, addrspace(5)
|
||||
%gep = getelementptr inbounds [3 x i8*], [3 x i8*] addrspace(5)* %a, i32 0, i32 0
|
||||
%cast = bitcast i8* addrspace(5)* %gep to i64 addrspace(5)*
|
||||
store i64 %v, i64 addrspace(5)* %cast, align 16
|
||||
%ld = load i64, i64 addrspace(5)* %cast, align 16
|
||||
%a = alloca [3 x ptr], align 16, addrspace(5)
|
||||
store i64 %v, ptr addrspace(5) %a, align 16
|
||||
%ld = load i64, ptr addrspace(5) %a, align 16
|
||||
ret i64 %ld
|
||||
}
|
||||
|
||||
@ -5,22 +5,22 @@
|
||||
|
||||
; GCN-LABEL: {{^}}stored_lds_pointer_value:
|
||||
; GCN: buffer_store_dword v
|
||||
define amdgpu_kernel void @stored_lds_pointer_value(float addrspace(5)* addrspace(1)* %ptr) #0 {
|
||||
define amdgpu_kernel void @stored_lds_pointer_value(ptr addrspace(1) %ptr) #0 {
|
||||
%tmp = alloca float, addrspace(5)
|
||||
store float 0.0, float addrspace(5)*%tmp
|
||||
store float addrspace(5)* %tmp, float addrspace(5)* addrspace(1)* %ptr
|
||||
store float 0.0, ptr addrspace(5) %tmp
|
||||
store ptr addrspace(5) %tmp, ptr addrspace(1) %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}stored_lds_pointer_value_offset:
|
||||
; GCN: buffer_store_dword v
|
||||
define amdgpu_kernel void @stored_lds_pointer_value_offset(float addrspace(5)* addrspace(1)* %ptr) #0 {
|
||||
define amdgpu_kernel void @stored_lds_pointer_value_offset(ptr addrspace(1) %ptr) #0 {
|
||||
%tmp0 = alloca float, addrspace(5)
|
||||
%tmp1 = alloca float, addrspace(5)
|
||||
store float 0.0, float addrspace(5)*%tmp0
|
||||
store float 0.0, float addrspace(5)*%tmp1
|
||||
store volatile float addrspace(5)* %tmp0, float addrspace(5)* addrspace(1)* %ptr
|
||||
store volatile float addrspace(5)* %tmp1, float addrspace(5)* addrspace(1)* %ptr
|
||||
store float 0.0, ptr addrspace(5) %tmp0
|
||||
store float 0.0, ptr addrspace(5) %tmp1
|
||||
store volatile ptr addrspace(5) %tmp0, ptr addrspace(1) %ptr
|
||||
store volatile ptr addrspace(5) %tmp1, ptr addrspace(1) %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -29,12 +29,12 @@ define amdgpu_kernel void @stored_lds_pointer_value_offset(float addrspace(5)* a
|
||||
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
|
||||
; GCN: buffer_store_dword v
|
||||
; GCN: buffer_store_dword v
|
||||
define amdgpu_kernel void @stored_lds_pointer_value_gep(float addrspace(5)* addrspace(1)* %ptr, i32 %idx) #0 {
|
||||
define amdgpu_kernel void @stored_lds_pointer_value_gep(ptr addrspace(1) %ptr, i32 %idx) #0 {
|
||||
bb:
|
||||
%tmp = alloca float, i32 16, addrspace(5)
|
||||
store float 0.0, float addrspace(5)* %tmp
|
||||
%tmp2 = getelementptr inbounds float, float addrspace(5)* %tmp, i32 %idx
|
||||
store float addrspace(5)* %tmp2, float addrspace(5)* addrspace(1)* %ptr
|
||||
store float 0.0, ptr addrspace(5) %tmp
|
||||
%tmp2 = getelementptr inbounds float, ptr addrspace(5) %tmp, i32 %idx
|
||||
store ptr addrspace(5) %tmp2, ptr addrspace(1) %ptr
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -46,29 +46,27 @@ bb:
|
||||
; GCN: buffer_store_dword
|
||||
; GCN: buffer_store_dword
|
||||
; GCN: buffer_store_dword
|
||||
define amdgpu_kernel void @stored_vector_pointer_value(i32 addrspace(5)* addrspace(1)* %out, i32 %index) {
|
||||
define amdgpu_kernel void @stored_vector_pointer_value(ptr addrspace(1) %out, i32 %index) {
|
||||
entry:
|
||||
%tmp0 = alloca [4 x i32], addrspace(5)
|
||||
%x = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp0, i32 0, i32 0
|
||||
%y = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp0, i32 0, i32 1
|
||||
%z = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp0, i32 0, i32 2
|
||||
%w = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp0, i32 0, i32 3
|
||||
store i32 0, i32 addrspace(5)* %x
|
||||
store i32 1, i32 addrspace(5)* %y
|
||||
store i32 2, i32 addrspace(5)* %z
|
||||
store i32 3, i32 addrspace(5)* %w
|
||||
%tmp1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp0, i32 0, i32 %index
|
||||
store i32 addrspace(5)* %tmp1, i32 addrspace(5)* addrspace(1)* %out
|
||||
%y = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp0, i32 0, i32 1
|
||||
%z = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp0, i32 0, i32 2
|
||||
%w = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp0, i32 0, i32 3
|
||||
store i32 0, ptr addrspace(5) %tmp0
|
||||
store i32 1, ptr addrspace(5) %y
|
||||
store i32 2, ptr addrspace(5) %z
|
||||
store i32 3, ptr addrspace(5) %w
|
||||
%tmp1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp0, i32 0, i32 %index
|
||||
store ptr addrspace(5) %tmp1, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}stored_fi_to_self:
|
||||
; GCN-NOT: ds_
|
||||
define amdgpu_kernel void @stored_fi_to_self() #0 {
|
||||
%tmp = alloca i32 addrspace(5)*, addrspace(5)
|
||||
store volatile i32 addrspace(5)* inttoptr (i32 1234 to i32 addrspace(5)*), i32 addrspace(5)* addrspace(5)* %tmp
|
||||
%bitcast = bitcast i32 addrspace(5)* addrspace(5)* %tmp to i32 addrspace(5)*
|
||||
store volatile i32 addrspace(5)* %bitcast, i32 addrspace(5)* addrspace(5)* %tmp
|
||||
%tmp = alloca ptr addrspace(5), addrspace(5)
|
||||
store volatile ptr addrspace(5) inttoptr (i32 1234 to ptr addrspace(5)), ptr addrspace(5) %tmp
|
||||
store volatile ptr addrspace(5) %tmp, ptr addrspace(5) %tmp
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
@ -3,23 +3,22 @@
|
||||
; This kernel starts with the amdgpu-no-workitem-id-* attributes, but
|
||||
; they need to be removed when these intrinsic uses are introduced.
|
||||
|
||||
; CHECK-LABEL: define amdgpu_kernel void @promote_to_lds(i32 addrspace(1)* %out, i32 %in) #0 {
|
||||
; CHECK: call noalias nonnull dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
|
||||
; CHECK-LABEL: define amdgpu_kernel void @promote_to_lds(ptr addrspace(1) %out, i32 %in) #0 {
|
||||
; CHECK: call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
|
||||
; CHECK: call i32 @llvm.amdgcn.workitem.id.x(), !range !2
|
||||
; CHECK: call i32 @llvm.amdgcn.workitem.id.y(), !range !2
|
||||
; CHECK: call i32 @llvm.amdgcn.workitem.id.z(), !range !2
|
||||
define amdgpu_kernel void @promote_to_lds(i32 addrspace(1)* %out, i32 %in) #0 {
|
||||
define amdgpu_kernel void @promote_to_lds(ptr addrspace(1) %out, i32 %in) #0 {
|
||||
entry:
|
||||
%tmp = alloca [2 x i32], addrspace(5)
|
||||
%tmp1 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 0
|
||||
%tmp2 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 1
|
||||
store i32 0, i32 addrspace(5)* %tmp1
|
||||
store i32 1, i32 addrspace(5)* %tmp2
|
||||
%tmp3 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 %in
|
||||
%tmp4 = load i32, i32 addrspace(5)* %tmp3
|
||||
%tmp5 = load volatile i32, i32 addrspace(1)* undef
|
||||
%tmp2 = getelementptr inbounds [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
|
||||
store i32 0, ptr addrspace(5) %tmp
|
||||
store i32 1, ptr addrspace(5) %tmp2
|
||||
%tmp3 = getelementptr inbounds [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 %in
|
||||
%tmp4 = load i32, ptr addrspace(5) %tmp3
|
||||
%tmp5 = load volatile i32, ptr addrspace(1) undef
|
||||
%tmp6 = add i32 %tmp4, %tmp5
|
||||
store i32 %tmp6, i32 addrspace(1)* %out
|
||||
store i32 %tmp6, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
@ -7,8 +7,8 @@ target datalayout = "A5"
|
||||
@some_lds = internal unnamed_addr addrspace(3) global [32 x i32] undef, align 4
|
||||
@some_dynamic_lds = external hidden addrspace(3) global [0 x i32], align 4
|
||||
|
||||
@initializer_user_some = addrspace(1) global i32 ptrtoint ([32 x i32] addrspace(3)* @some_lds to i32), align 4
|
||||
@initializer_user_all = addrspace(1) global i32 ptrtoint ([16384 x i32] addrspace(3)* @all_lds to i32), align 4
|
||||
@initializer_user_some = addrspace(1) global i32 ptrtoint (ptr addrspace(3) @some_lds to i32), align 4
|
||||
@initializer_user_all = addrspace(1) global i32 ptrtoint (ptr addrspace(3) @all_lds to i32), align 4
|
||||
|
||||
; This function cannot promote to using LDS because of the size of the
|
||||
; constant expression use in the function, which was previously not
|
||||
@ -18,22 +18,21 @@ target datalayout = "A5"
|
||||
|
||||
; ASM-LABEL: constant_expression_uses_all_lds:
|
||||
; ASM: .amdhsa_group_segment_fixed_size 65536
|
||||
define amdgpu_kernel void @constant_expression_uses_all_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
|
||||
define amdgpu_kernel void @constant_expression_uses_all_lds(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
|
||||
entry:
|
||||
%stack = alloca [4 x i32], align 4, addrspace(5)
|
||||
%gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
|
||||
%gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
|
||||
%gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
|
||||
%gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
|
||||
store i32 9, i32 addrspace(5)* %gep0
|
||||
store i32 10, i32 addrspace(5)* %gep1
|
||||
store i32 99, i32 addrspace(5)* %gep2
|
||||
store i32 43, i32 addrspace(5)* %gep3
|
||||
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
|
||||
%load = load i32, i32 addrspace(5)* %arrayidx, align 4
|
||||
store i32 %load, i32 addrspace(1)* %out
|
||||
%gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
|
||||
%gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
|
||||
%gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
|
||||
store i32 9, ptr addrspace(5) %stack
|
||||
store i32 10, ptr addrspace(5) %gep1
|
||||
store i32 99, ptr addrspace(5) %gep2
|
||||
store i32 43, ptr addrspace(5) %gep3
|
||||
%arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
|
||||
%load = load i32, ptr addrspace(5) %arrayidx, align 4
|
||||
store i32 %load, ptr addrspace(1) %out
|
||||
|
||||
store volatile i32 ptrtoint ([16384 x i32] addrspace(3)* @all_lds to i32), i32 addrspace(1)* undef
|
||||
store volatile i32 ptrtoint (ptr addrspace(3) @all_lds to i32), ptr addrspace(1) undef
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -45,21 +44,20 @@ entry:
|
||||
|
||||
; ASM-LABEL: {{^}}constant_expression_uses_some_lds:
|
||||
; ASM: .amdhsa_group_segment_fixed_size 4224{{$}}
|
||||
define amdgpu_kernel void @constant_expression_uses_some_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
|
||||
define amdgpu_kernel void @constant_expression_uses_some_lds(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
|
||||
entry:
|
||||
%stack = alloca [4 x i32], align 4, addrspace(5)
|
||||
%gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
|
||||
%gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
|
||||
%gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
|
||||
%gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
|
||||
store i32 9, i32 addrspace(5)* %gep0
|
||||
store i32 10, i32 addrspace(5)* %gep1
|
||||
store i32 99, i32 addrspace(5)* %gep2
|
||||
store i32 43, i32 addrspace(5)* %gep3
|
||||
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
|
||||
%load = load i32, i32 addrspace(5)* %arrayidx, align 4
|
||||
store i32 %load, i32 addrspace(1)* %out
|
||||
store volatile i32 ptrtoint ([32 x i32] addrspace(3)* @some_lds to i32), i32 addrspace(1)* undef
|
||||
%gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
|
||||
%gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
|
||||
%gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
|
||||
store i32 9, ptr addrspace(5) %stack
|
||||
store i32 10, ptr addrspace(5) %gep1
|
||||
store i32 99, ptr addrspace(5) %gep2
|
||||
store i32 43, ptr addrspace(5) %gep3
|
||||
%arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
|
||||
%load = load i32, ptr addrspace(5) %arrayidx, align 4
|
||||
store i32 %load, ptr addrspace(1) %out
|
||||
store volatile i32 ptrtoint (ptr addrspace(3) @some_lds to i32), ptr addrspace(1) undef
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -71,47 +69,44 @@ entry:
|
||||
|
||||
; ASM-LABEL: {{^}}constant_expression_uses_some_dynamic_lds:
|
||||
; ASM: .amdhsa_group_segment_fixed_size 0{{$}}
|
||||
define amdgpu_kernel void @constant_expression_uses_some_dynamic_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
|
||||
define amdgpu_kernel void @constant_expression_uses_some_dynamic_lds(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
|
||||
entry:
|
||||
%stack = alloca [4 x i32], align 4, addrspace(5)
|
||||
%gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
|
||||
%gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
|
||||
%gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
|
||||
%gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
|
||||
store i32 9, i32 addrspace(5)* %gep0
|
||||
store i32 10, i32 addrspace(5)* %gep1
|
||||
store i32 99, i32 addrspace(5)* %gep2
|
||||
store i32 43, i32 addrspace(5)* %gep3
|
||||
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
|
||||
%load = load i32, i32 addrspace(5)* %arrayidx, align 4
|
||||
store i32 %load, i32 addrspace(1)* %out
|
||||
%gep_dyn_lds = getelementptr inbounds [0 x i32], [0 x i32]* addrspacecast ([0 x i32] addrspace(3)* @some_dynamic_lds to [0 x i32]*), i64 0, i64 0
|
||||
store i32 1234, i32* %gep_dyn_lds, align 4
|
||||
%gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
|
||||
%gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
|
||||
%gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
|
||||
store i32 9, ptr addrspace(5) %stack
|
||||
store i32 10, ptr addrspace(5) %gep1
|
||||
store i32 99, ptr addrspace(5) %gep2
|
||||
store i32 43, ptr addrspace(5) %gep3
|
||||
%arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
|
||||
%load = load i32, ptr addrspace(5) %arrayidx, align 4
|
||||
store i32 %load, ptr addrspace(1) %out
|
||||
store i32 1234, ptr addrspacecast (ptr addrspace(3) @some_dynamic_lds to ptr), align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @callee(i8*)
|
||||
declare void @callee(ptr)
|
||||
|
||||
; IR-LABEL: @constant_expression_uses_all_lds_multi_level(
|
||||
; IR: alloca
|
||||
|
||||
; ASM-LABEL: {{^}}constant_expression_uses_all_lds_multi_level:
|
||||
; ASM: .amdhsa_group_segment_fixed_size 65536{{$}}
|
||||
define amdgpu_kernel void @constant_expression_uses_all_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
|
||||
define amdgpu_kernel void @constant_expression_uses_all_lds_multi_level(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
|
||||
entry:
|
||||
%stack = alloca [4 x i32], align 4, addrspace(5)
|
||||
%gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
|
||||
%gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
|
||||
%gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
|
||||
%gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
|
||||
store i32 9, i32 addrspace(5)* %gep0
|
||||
store i32 10, i32 addrspace(5)* %gep1
|
||||
store i32 99, i32 addrspace(5)* %gep2
|
||||
store i32 43, i32 addrspace(5)* %gep3
|
||||
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
|
||||
%load = load i32, i32 addrspace(5)* %arrayidx, align 4
|
||||
store i32 %load, i32 addrspace(1)* %out
|
||||
call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([16384 x i32], [16384 x i32] addrspace(3)* @all_lds, i32 0, i32 8) to i8 addrspace(3)*) to i8*))
|
||||
%gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
|
||||
%gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
|
||||
%gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
|
||||
store i32 9, ptr addrspace(5) %stack
|
||||
store i32 10, ptr addrspace(5) %gep1
|
||||
store i32 99, ptr addrspace(5) %gep2
|
||||
store i32 43, ptr addrspace(5) %gep3
|
||||
%arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
|
||||
%load = load i32, ptr addrspace(5) %arrayidx, align 4
|
||||
store i32 %load, ptr addrspace(1) %out
|
||||
call void @callee(ptr addrspacecast (ptr addrspace(3) getelementptr inbounds ([16384 x i32], ptr addrspace(3) @all_lds, i32 0, i32 8) to ptr))
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -121,21 +116,20 @@ entry:
|
||||
|
||||
; ASM-LABEL: {{^}}constant_expression_uses_some_lds_multi_level:
|
||||
; ASM: .amdhsa_group_segment_fixed_size 4224{{$}}
|
||||
define amdgpu_kernel void @constant_expression_uses_some_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
|
||||
define amdgpu_kernel void @constant_expression_uses_some_lds_multi_level(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
|
||||
entry:
|
||||
%stack = alloca [4 x i32], align 4, addrspace(5)
|
||||
%gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
|
||||
%gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
|
||||
%gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
|
||||
%gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
|
||||
store i32 9, i32 addrspace(5)* %gep0
|
||||
store i32 10, i32 addrspace(5)* %gep1
|
||||
store i32 99, i32 addrspace(5)* %gep2
|
||||
store i32 43, i32 addrspace(5)* %gep3
|
||||
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
|
||||
%load = load i32, i32 addrspace(5)* %arrayidx, align 4
|
||||
store i32 %load, i32 addrspace(1)* %out
|
||||
call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(3)* @some_lds, i32 0, i32 8) to i8 addrspace(3)*) to i8*))
|
||||
%gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
|
||||
%gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
|
||||
%gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
|
||||
store i32 9, ptr addrspace(5) %stack
|
||||
store i32 10, ptr addrspace(5) %gep1
|
||||
store i32 99, ptr addrspace(5) %gep2
|
||||
store i32 43, ptr addrspace(5) %gep3
|
||||
%arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
|
||||
%load = load i32, ptr addrspace(5) %arrayidx, align 4
|
||||
store i32 %load, ptr addrspace(1) %out
|
||||
call void @callee(ptr addrspacecast (ptr addrspace(3) getelementptr inbounds ([32 x i32], ptr addrspace(3) @some_lds, i32 0, i32 8) to ptr))
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -144,21 +138,20 @@ entry:
|
||||
|
||||
; ASM-LABEL: {{^}}constant_expression_uses_some_dynamic_lds_multi_level:
|
||||
; ASM: .amdhsa_group_segment_fixed_size 0{{$}}
|
||||
define amdgpu_kernel void @constant_expression_uses_some_dynamic_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
|
||||
define amdgpu_kernel void @constant_expression_uses_some_dynamic_lds_multi_level(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
|
||||
entry:
|
||||
%stack = alloca [4 x i32], align 4, addrspace(5)
|
||||
%gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
|
||||
%gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
|
||||
%gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
|
||||
%gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
|
||||
store i32 9, i32 addrspace(5)* %gep0
|
||||
store i32 10, i32 addrspace(5)* %gep1
|
||||
store i32 99, i32 addrspace(5)* %gep2
|
||||
store i32 43, i32 addrspace(5)* %gep3
|
||||
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
|
||||
%load = load i32, i32 addrspace(5)* %arrayidx, align 4
|
||||
store i32 %load, i32 addrspace(1)* %out
|
||||
call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([0 x i32], [0 x i32] addrspace(3)* @some_dynamic_lds, i32 0, i32 0) to i8 addrspace(3)*) to i8*))
|
||||
%gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
|
||||
%gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
|
||||
%gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
|
||||
store i32 9, ptr addrspace(5) %stack
|
||||
store i32 10, ptr addrspace(5) %gep1
|
||||
store i32 99, ptr addrspace(5) %gep2
|
||||
store i32 43, ptr addrspace(5) %gep3
|
||||
%arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
|
||||
%load = load i32, ptr addrspace(5) %arrayidx, align 4
|
||||
store i32 %load, ptr addrspace(1) %out
|
||||
call void @callee(ptr addrspacecast (ptr addrspace(3) @some_dynamic_lds to ptr))
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -168,22 +161,21 @@ entry:
|
||||
|
||||
; ASM-LABEL: {{^}}constant_expression_uses_some_lds_global_initializer:
|
||||
; ASM: .amdhsa_group_segment_fixed_size 4096{{$}}
|
||||
define amdgpu_kernel void @constant_expression_uses_some_lds_global_initializer(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
|
||||
define amdgpu_kernel void @constant_expression_uses_some_lds_global_initializer(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
|
||||
entry:
|
||||
%stack = alloca [4 x i32], align 4, addrspace(5)
|
||||
%gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
|
||||
%gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
|
||||
%gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
|
||||
%gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
|
||||
store i32 9, i32 addrspace(5)* %gep0
|
||||
store i32 10, i32 addrspace(5)* %gep1
|
||||
store i32 99, i32 addrspace(5)* %gep2
|
||||
store i32 43, i32 addrspace(5)* %gep3
|
||||
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
|
||||
%load = load i32, i32 addrspace(5)* %arrayidx, align 4
|
||||
store i32 %load, i32 addrspace(1)* %out
|
||||
%gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
|
||||
%gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
|
||||
%gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
|
||||
store i32 9, ptr addrspace(5) %stack
|
||||
store i32 10, ptr addrspace(5) %gep1
|
||||
store i32 99, ptr addrspace(5) %gep2
|
||||
store i32 43, ptr addrspace(5) %gep3
|
||||
%arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
|
||||
%load = load i32, ptr addrspace(5) %arrayidx, align 4
|
||||
store i32 %load, ptr addrspace(1) %out
|
||||
|
||||
store volatile i32 ptrtoint (i32 addrspace(1)* @initializer_user_some to i32), i32 addrspace(1)* undef
|
||||
store volatile i32 ptrtoint (ptr addrspace(1) @initializer_user_some to i32), ptr addrspace(1) undef
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -195,21 +187,20 @@ entry:
|
||||
|
||||
; ASM-LABEL: {{^}}constant_expression_uses_all_lds_global_initializer:
|
||||
; ASM: .group_segment_fixed_size: 65536
|
||||
define amdgpu_kernel void @constant_expression_uses_all_lds_global_initializer(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
|
||||
define amdgpu_kernel void @constant_expression_uses_all_lds_global_initializer(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
|
||||
entry:
|
||||
%stack = alloca [4 x i32], align 4, addrspace(5)
|
||||
%gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
|
||||
%gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
|
||||
%gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
|
||||
%gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
|
||||
store i32 9, i32 addrspace(5)* %gep0
|
||||
store i32 10, i32 addrspace(5)* %gep1
|
||||
store i32 99, i32 addrspace(5)* %gep2
|
||||
store i32 43, i32 addrspace(5)* %gep3
|
||||
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
|
||||
%load = load i32, i32 addrspace(5)* %arrayidx, align 4
|
||||
store i32 %load, i32 addrspace(1)* %out
|
||||
store volatile i32 ptrtoint (i32 addrspace(1)* @initializer_user_all to i32), i32 addrspace(1)* undef
|
||||
%gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
|
||||
%gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
|
||||
%gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
|
||||
store i32 9, ptr addrspace(5) %stack
|
||||
store i32 10, ptr addrspace(5) %gep1
|
||||
store i32 99, ptr addrspace(5) %gep2
|
||||
store i32 43, ptr addrspace(5) %gep3
|
||||
%arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
|
||||
%load = load i32, ptr addrspace(5) %arrayidx, align 4
|
||||
store i32 %load, ptr addrspace(1) %out
|
||||
store volatile i32 ptrtoint (ptr addrspace(1) @initializer_user_all to i32), ptr addrspace(1) undef
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
@ -2,86 +2,86 @@
|
||||
|
||||
|
||||
; CHECK-LABEL: @branch_ptr_var_same_alloca(
|
||||
; CHECK: getelementptr inbounds [256 x [64 x i32]], [256 x [64 x i32]] addrspace(3)* @branch_ptr_var_same_alloca.alloca, i32 0, i32 %{{[0-9]+}}
|
||||
; CHECK: [[GEP:%[0-9]+]] = getelementptr inbounds [256 x [64 x i32]], ptr addrspace(3) @branch_ptr_var_same_alloca.alloca, i32 0, i32 %{{[0-9]+}}
|
||||
|
||||
; CHECK: if:
|
||||
; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %a
|
||||
; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(3) [[GEP]], i32 0, i32 %a
|
||||
|
||||
; CHECK: else:
|
||||
; CHECK: %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %15, i32 0, i32 %b
|
||||
; CHECK: %arrayidx1 = getelementptr inbounds [64 x i32], ptr addrspace(3) [[GEP]], i32 0, i32 %b
|
||||
|
||||
; CHECK: endif:
|
||||
; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
|
||||
; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4
|
||||
; CHECK: %phi.ptr = phi ptr addrspace(3) [ %arrayidx0, %if ], [ %arrayidx1, %else ]
|
||||
; CHECK: store i32 0, ptr addrspace(3) %phi.ptr, align 4
|
||||
define amdgpu_kernel void @branch_ptr_var_same_alloca(i32 %a, i32 %b) #0 {
|
||||
entry:
|
||||
%alloca = alloca [64 x i32], align 4, addrspace(5)
|
||||
br i1 undef, label %if, label %else
|
||||
|
||||
if:
|
||||
%arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a
|
||||
%arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
|
||||
br label %endif
|
||||
|
||||
else:
|
||||
%arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %b
|
||||
%arrayidx1 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %b
|
||||
br label %endif
|
||||
|
||||
endif:
|
||||
%phi.ptr = phi i32 addrspace(5)* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
|
||||
store i32 0, i32 addrspace(5)* %phi.ptr, align 4
|
||||
%phi.ptr = phi ptr addrspace(5) [ %arrayidx0, %if ], [ %arrayidx1, %else ]
|
||||
store i32 0, ptr addrspace(5) %phi.ptr, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @branch_ptr_phi_alloca_null_0(
|
||||
; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ null, %entry ]
|
||||
; CHECK: %phi.ptr = phi ptr addrspace(3) [ %arrayidx0, %if ], [ null, %entry ]
|
||||
define amdgpu_kernel void @branch_ptr_phi_alloca_null_0(i32 %a, i32 %b) #0 {
|
||||
entry:
|
||||
%alloca = alloca [64 x i32], align 4, addrspace(5)
|
||||
br i1 undef, label %if, label %endif
|
||||
|
||||
if:
|
||||
%arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a
|
||||
%arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
|
||||
br label %endif
|
||||
|
||||
endif:
|
||||
%phi.ptr = phi i32 addrspace(5)* [ %arrayidx0, %if ], [ null, %entry ]
|
||||
store i32 0, i32 addrspace(5)* %phi.ptr, align 4
|
||||
%phi.ptr = phi ptr addrspace(5) [ %arrayidx0, %if ], [ null, %entry ]
|
||||
store i32 0, ptr addrspace(5) %phi.ptr, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @branch_ptr_phi_alloca_null_1(
|
||||
; CHECK: %phi.ptr = phi i32 addrspace(3)* [ null, %entry ], [ %arrayidx0, %if ]
|
||||
; CHECK: %phi.ptr = phi ptr addrspace(3) [ null, %entry ], [ %arrayidx0, %if ]
|
||||
define amdgpu_kernel void @branch_ptr_phi_alloca_null_1(i32 %a, i32 %b) #0 {
|
||||
entry:
|
||||
%alloca = alloca [64 x i32], align 4, addrspace(5)
|
||||
br i1 undef, label %if, label %endif
|
||||
|
||||
if:
|
||||
%arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a
|
||||
%arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
|
||||
br label %endif
|
||||
|
||||
endif:
|
||||
%phi.ptr = phi i32 addrspace(5)* [ null, %entry ], [ %arrayidx0, %if ]
|
||||
store i32 0, i32 addrspace(5)* %phi.ptr, align 4
|
||||
%phi.ptr = phi ptr addrspace(5) [ null, %entry ], [ %arrayidx0, %if ]
|
||||
store i32 0, ptr addrspace(5) %phi.ptr, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @one_phi_value(
|
||||
; CHECK: getelementptr inbounds [256 x [64 x i32]], [256 x [64 x i32]] addrspace(3)* @one_phi_value.alloca, i32 0, i32 %14
|
||||
; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %a
|
||||
; CHECK: [[GEP0:%[0-9]+]] = getelementptr inbounds [256 x [64 x i32]], ptr addrspace(3) @one_phi_value.alloca, i32 0, i32 %{{[0-9]+}}
|
||||
; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(3) [[GEP0]], i32 0, i32 %a
|
||||
|
||||
; CHECK: br label %exit
|
||||
; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %entry ]
|
||||
; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4
|
||||
; CHECK: %phi.ptr = phi ptr addrspace(3) [ %arrayidx0, %entry ]
|
||||
; CHECK: store i32 0, ptr addrspace(3) %phi.ptr, align 4
|
||||
define amdgpu_kernel void @one_phi_value(i32 %a) #0 {
|
||||
entry:
|
||||
%alloca = alloca [64 x i32], align 4, addrspace(5)
|
||||
%arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a
|
||||
%arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
|
||||
br label %exit
|
||||
|
||||
exit:
|
||||
%phi.ptr = phi i32 addrspace(5)* [ %arrayidx0, %entry ]
|
||||
store i32 0, i32 addrspace(5)* %phi.ptr, align 4
|
||||
%phi.ptr = phi ptr addrspace(5) [ %arrayidx0, %entry ]
|
||||
store i32 0, ptr addrspace(5) %phi.ptr, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -89,30 +89,30 @@ exit:
|
||||
; CHECK: %alloca = alloca [64 x i32], align 4
|
||||
|
||||
; CHECK: if:
|
||||
; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a
|
||||
; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
|
||||
|
||||
; CHECK: else:
|
||||
; CHECK: %arrayidx1 = call i32 addrspace(5)* @get_unknown_pointer()
|
||||
; CHECK: %arrayidx1 = call ptr addrspace(5) @get_unknown_pointer()
|
||||
|
||||
; CHECK: endif:
|
||||
; CHECK: %phi.ptr = phi i32 addrspace(5)* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
|
||||
; CHECK: store i32 0, i32 addrspace(5)* %phi.ptr, align 4
|
||||
; CHECK: %phi.ptr = phi ptr addrspace(5) [ %arrayidx0, %if ], [ %arrayidx1, %else ]
|
||||
; CHECK: store i32 0, ptr addrspace(5) %phi.ptr, align 4
|
||||
define amdgpu_kernel void @branch_ptr_alloca_unknown_obj(i32 %a, i32 %b) #0 {
|
||||
entry:
|
||||
%alloca = alloca [64 x i32], align 4, addrspace(5)
|
||||
br i1 undef, label %if, label %else
|
||||
|
||||
if:
|
||||
%arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a
|
||||
%arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
|
||||
br label %endif
|
||||
|
||||
else:
|
||||
%arrayidx1 = call i32 addrspace(5)* @get_unknown_pointer()
|
||||
%arrayidx1 = call ptr addrspace(5) @get_unknown_pointer()
|
||||
br label %endif
|
||||
|
||||
endif:
|
||||
%phi.ptr = phi i32 addrspace(5)* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
|
||||
store i32 0, i32 addrspace(5)* %phi.ptr, align 4
|
||||
%phi.ptr = phi ptr addrspace(5) [ %arrayidx0, %if ], [ %arrayidx1, %else ]
|
||||
store i32 0, ptr addrspace(5) %phi.ptr, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -133,12 +133,12 @@ endif:
|
||||
|
||||
; CHECK-LABEL: @ptr_induction_var_same_alloca(
|
||||
; CHECK: %alloca = alloca [64 x i32], align 4
|
||||
; CHECK: phi i32 addrspace(5)* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ]
|
||||
; CHECK: phi ptr addrspace(5) [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ]
|
||||
define amdgpu_kernel void @ptr_induction_var_same_alloca() #0 {
|
||||
entry:
|
||||
%alloca = alloca [64 x i32], align 4, addrspace(5)
|
||||
%arrayidx = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 2
|
||||
%arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 48
|
||||
%arrayidx = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 2
|
||||
%arrayidx1 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 48
|
||||
br label %for.body
|
||||
|
||||
for.cond.cleanup: ; preds = %for.body
|
||||
@ -146,11 +146,11 @@ for.cond.cleanup: ; preds = %for.body
|
||||
|
||||
for.body: ; preds = %for.body, %entry
|
||||
%i.09 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
|
||||
%p.08 = phi i32 addrspace(5)* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ]
|
||||
store i32 %i.09, i32 addrspace(5)* %p.08, align 4
|
||||
%incdec.ptr = getelementptr inbounds i32, i32 addrspace(5)* %p.08, i32 1
|
||||
%p.08 = phi ptr addrspace(5) [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ]
|
||||
store i32 %i.09, ptr addrspace(5) %p.08, align 4
|
||||
%incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %p.08, i32 1
|
||||
%inc = add nuw nsw i32 %i.09, 1
|
||||
%cmp = icmp eq i32 addrspace(5)* %incdec.ptr, %arrayidx1
|
||||
%cmp = icmp eq ptr addrspace(5) %incdec.ptr, %arrayidx1
|
||||
br i1 %cmp, label %for.cond.cleanup, label %for.body
|
||||
}
|
||||
|
||||
@ -170,14 +170,14 @@ for.body: ; preds = %for.body, %entry
|
||||
|
||||
; CHECK-LABEL: @ptr_induction_var_alloca_unknown(
|
||||
; CHECK: %alloca = alloca [64 x i32], align 4
|
||||
; CHECK: %p.08 = phi i32 addrspace(5)* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ]
|
||||
; CHECK: %cmp = icmp eq i32 addrspace(5)* %incdec.ptr, %call
|
||||
; CHECK: %p.08 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ]
|
||||
; CHECK: %cmp = icmp eq ptr addrspace(5) %incdec.ptr, %call
|
||||
define amdgpu_kernel void @ptr_induction_var_alloca_unknown() #0 {
|
||||
entry:
|
||||
%alloca = alloca [64 x i32], align 4, addrspace(5)
|
||||
%arrayidx = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 2
|
||||
%call = tail call i32 addrspace(5)* @get_unknown_pointer() #2
|
||||
%cmp.7 = icmp eq i32 addrspace(5)* %arrayidx, %call
|
||||
%arrayidx = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 2
|
||||
%call = tail call ptr addrspace(5) @get_unknown_pointer() #2
|
||||
%cmp.7 = icmp eq ptr addrspace(5) %arrayidx, %call
|
||||
br i1 %cmp.7, label %for.cond.cleanup, label %for.body.preheader
|
||||
|
||||
for.body.preheader: ; preds = %entry
|
||||
@ -191,14 +191,14 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo
|
||||
|
||||
for.body: ; preds = %for.body, %for.body.preheader
|
||||
%i.09 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
|
||||
%p.08 = phi i32 addrspace(5)* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ]
|
||||
store i32 %i.09, i32 addrspace(5)* %p.08, align 4
|
||||
%incdec.ptr = getelementptr inbounds i32, i32 addrspace(5)* %p.08, i32 1
|
||||
%p.08 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ]
|
||||
store i32 %i.09, ptr addrspace(5) %p.08, align 4
|
||||
%incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %p.08, i32 1
|
||||
%inc = add nuw nsw i32 %i.09, 1
|
||||
%cmp = icmp eq i32 addrspace(5)* %incdec.ptr, %call
|
||||
%cmp = icmp eq ptr addrspace(5) %incdec.ptr, %call
|
||||
br i1 %cmp, label %for.cond.cleanup.loopexit, label %for.body
|
||||
}
|
||||
|
||||
declare i32 addrspace(5)* @get_unknown_pointer() #0
|
||||
declare ptr addrspace(5) @get_unknown_pointer() #0
|
||||
|
||||
attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
|
||||
|
||||
@ -3,19 +3,18 @@
|
||||
; This is just an arbitrary intrinsic that shouldn't ever need to be
|
||||
; handled to ensure it doesn't crash.
|
||||
|
||||
declare void @llvm.stackrestore(i8*) #2
|
||||
declare void @llvm.stackrestore(ptr) #2
|
||||
|
||||
; CHECK-LABEL: @try_promote_unhandled_intrinsic(
|
||||
; CHECK: alloca
|
||||
; CHECK: call void @llvm.stackrestore(i8* %tmp1)
|
||||
define amdgpu_kernel void @try_promote_unhandled_intrinsic(i32 addrspace(1)* %arg) #2 {
|
||||
; CHECK: call void @llvm.stackrestore(ptr %tmp)
|
||||
define amdgpu_kernel void @try_promote_unhandled_intrinsic(ptr addrspace(1) %arg) #2 {
|
||||
bb:
|
||||
%tmp = alloca i32, align 4
|
||||
%tmp1 = bitcast i32* %tmp to i8*
|
||||
%tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
|
||||
%tmp3 = load i32, i32 addrspace(1)* %tmp2
|
||||
store i32 %tmp3, i32* %tmp
|
||||
call void @llvm.stackrestore(i8* %tmp1)
|
||||
%tmp2 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
|
||||
%tmp3 = load i32, ptr addrspace(1) %tmp2
|
||||
store i32 %tmp3, ptr %tmp
|
||||
call void @llvm.stackrestore(ptr %tmp)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
@ -11,13 +11,13 @@
|
||||
; GCN: v_cndmask_b32_e32 [[RES:v[0-9]+]], 4.0,
|
||||
; GCN: store_dword v{{.+}}, [[RES]]
|
||||
|
||||
; OPT: %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
|
||||
; OPT: store <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, <4 x float> addrspace(5)* %alloca, align 4
|
||||
; OPT: %0 = load <4 x float>, <4 x float> addrspace(5)* %alloca
|
||||
; OPT: %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
|
||||
; OPT: store <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, ptr addrspace(5) %alloca, align 4
|
||||
; OPT: %0 = load <4 x float>, ptr addrspace(5) %alloca
|
||||
; OPT: %1 = extractelement <4 x float> %0, i32 %sel2
|
||||
; OPT: store float %1, float addrspace(1)* %out, align 4
|
||||
; OPT: store float %1, ptr addrspace(1) %out, align 4
|
||||
|
||||
define amdgpu_kernel void @float4_alloca_store4(float addrspace(1)* %out, float addrspace(3)* %dummy_lds) {
|
||||
define amdgpu_kernel void @float4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
|
||||
entry:
|
||||
%alloca = alloca <4 x float>, align 16, addrspace(5)
|
||||
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
@ -26,10 +26,10 @@ entry:
|
||||
%c2 = icmp uge i32 %y, 3
|
||||
%sel1 = select i1 %c1, i32 1, i32 2
|
||||
%sel2 = select i1 %c2, i32 0, i32 %sel1
|
||||
%gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
|
||||
store <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <4 x float> addrspace(5)* %alloca, align 4
|
||||
%load = load float, float addrspace(5)* %gep, align 4
|
||||
store float %load, float addrspace(1)* %out, align 4
|
||||
%gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
|
||||
store <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, ptr addrspace(5) %alloca, align 4
|
||||
%load = load float, ptr addrspace(5) %gep, align 4
|
||||
store float %load, ptr addrspace(1) %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -46,14 +46,14 @@ entry:
|
||||
; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
|
||||
; GCN: store_dwordx4 v{{.+}},
|
||||
|
||||
; OPT: %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
|
||||
; OPT: %0 = load <4 x float>, <4 x float> addrspace(5)* %alloca
|
||||
; OPT: %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
|
||||
; OPT: %0 = load <4 x float>, ptr addrspace(5) %alloca
|
||||
; OPT: %1 = insertelement <4 x float> %0, float 1.000000e+00, i32 %sel2
|
||||
; OPT: store <4 x float> %1, <4 x float> addrspace(5)* %alloca
|
||||
; OPT: %load = load <4 x float>, <4 x float> addrspace(5)* %alloca, align 4
|
||||
; OPT: store <4 x float> %load, <4 x float> addrspace(1)* %out, align 4
|
||||
; OPT: store <4 x float> %1, ptr addrspace(5) %alloca
|
||||
; OPT: %load = load <4 x float>, ptr addrspace(5) %alloca, align 4
|
||||
; OPT: store <4 x float> %load, ptr addrspace(1) %out, align 4
|
||||
|
||||
define amdgpu_kernel void @float4_alloca_load4(<4 x float> addrspace(1)* %out, float addrspace(3)* %dummy_lds) {
|
||||
define amdgpu_kernel void @float4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
|
||||
entry:
|
||||
%alloca = alloca <4 x float>, align 16, addrspace(5)
|
||||
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
@ -62,10 +62,10 @@ entry:
|
||||
%c2 = icmp uge i32 %y, 3
|
||||
%sel1 = select i1 %c1, i32 1, i32 2
|
||||
%sel2 = select i1 %c2, i32 0, i32 %sel1
|
||||
%gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
|
||||
store float 1.0, float addrspace(5)* %gep, align 4
|
||||
%load = load <4 x float>, <4 x float> addrspace(5)* %alloca, align 4
|
||||
store <4 x float> %load, <4 x float> addrspace(1)* %out, align 4
|
||||
%gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
|
||||
store float 1.0, ptr addrspace(5) %gep, align 4
|
||||
%load = load <4 x float>, ptr addrspace(5) %alloca, align 4
|
||||
store <4 x float> %load, ptr addrspace(1) %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -77,13 +77,13 @@ entry:
|
||||
; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
|
||||
; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]]
|
||||
|
||||
; OPT: %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
|
||||
; OPT: store <4 x half> <half 0xH3C00, half 0xH4000, half 0xH4200, half 0xH4400>, <4 x half> addrspace(5)* %alloca, align 2
|
||||
; OPT: %0 = load <4 x half>, <4 x half> addrspace(5)* %alloca
|
||||
; OPT: %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
|
||||
; OPT: store <4 x half> <half 0xH3C00, half 0xH4000, half 0xH4200, half 0xH4400>, ptr addrspace(5) %alloca, align 2
|
||||
; OPT: %0 = load <4 x half>, ptr addrspace(5) %alloca
|
||||
; OPT: %1 = extractelement <4 x half> %0, i32 %sel2
|
||||
; OPT: store half %1, half addrspace(1)* %out, align 2
|
||||
; OPT: store half %1, ptr addrspace(1) %out, align 2
|
||||
|
||||
define amdgpu_kernel void @half4_alloca_store4(half addrspace(1)* %out, half addrspace(3)* %dummy_lds) {
|
||||
define amdgpu_kernel void @half4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
|
||||
entry:
|
||||
%alloca = alloca <4 x half>, align 16, addrspace(5)
|
||||
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
@ -92,10 +92,10 @@ entry:
|
||||
%c2 = icmp uge i32 %y, 3
|
||||
%sel1 = select i1 %c1, i32 1, i32 2
|
||||
%sel2 = select i1 %c2, i32 0, i32 %sel1
|
||||
%gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
|
||||
store <4 x half> <half 1.0, half 2.0, half 3.0, half 4.0>, <4 x half> addrspace(5)* %alloca, align 2
|
||||
%load = load half, half addrspace(5)* %gep, align 2
|
||||
store half %load, half addrspace(1)* %out, align 2
|
||||
%gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
|
||||
store <4 x half> <half 1.0, half 2.0, half 3.0, half 4.0>, ptr addrspace(5) %alloca, align 2
|
||||
%load = load half, ptr addrspace(5) %gep, align 2
|
||||
store half %load, ptr addrspace(1) %out, align 2
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -105,14 +105,14 @@ entry:
|
||||
; GCN-NOT: buffer_
|
||||
; GCN: s_mov_b64 s[{{[0-9:]+}}], 0xffff
|
||||
|
||||
; OPT: %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
|
||||
; OPT: %0 = load <4 x half>, <4 x half> addrspace(5)* %alloca
|
||||
; OPT: %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
|
||||
; OPT: %0 = load <4 x half>, ptr addrspace(5) %alloca
|
||||
; OPT: %1 = insertelement <4 x half> %0, half 0xH3C00, i32 %sel2
|
||||
; OPT: store <4 x half> %1, <4 x half> addrspace(5)* %alloca
|
||||
; OPT: %load = load <4 x half>, <4 x half> addrspace(5)* %alloca, align 2
|
||||
; OPT: store <4 x half> %load, <4 x half> addrspace(1)* %out, align 2
|
||||
; OPT: store <4 x half> %1, ptr addrspace(5) %alloca
|
||||
; OPT: %load = load <4 x half>, ptr addrspace(5) %alloca, align 2
|
||||
; OPT: store <4 x half> %load, ptr addrspace(1) %out, align 2
|
||||
|
||||
define amdgpu_kernel void @half4_alloca_load4(<4 x half> addrspace(1)* %out, half addrspace(3)* %dummy_lds) {
|
||||
define amdgpu_kernel void @half4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
|
||||
entry:
|
||||
%alloca = alloca <4 x half>, align 16, addrspace(5)
|
||||
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
@ -121,10 +121,10 @@ entry:
|
||||
%c2 = icmp uge i32 %y, 3
|
||||
%sel1 = select i1 %c1, i32 1, i32 2
|
||||
%sel2 = select i1 %c2, i32 0, i32 %sel1
|
||||
%gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
|
||||
store half 1.0, half addrspace(5)* %gep, align 4
|
||||
%load = load <4 x half>, <4 x half> addrspace(5)* %alloca, align 2
|
||||
store <4 x half> %load, <4 x half> addrspace(1)* %out, align 2
|
||||
%gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
|
||||
store half 1.0, ptr addrspace(5) %gep, align 4
|
||||
%load = load <4 x half>, ptr addrspace(5) %alloca, align 2
|
||||
store <4 x half> %load, ptr addrspace(1) %out, align 2
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -136,13 +136,13 @@ entry:
|
||||
; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x20001
|
||||
; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]]
|
||||
|
||||
; OPT: %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
|
||||
; OPT: store <4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> addrspace(5)* %alloca, align 2
|
||||
; OPT: %0 = load <4 x i16>, <4 x i16> addrspace(5)* %alloca
|
||||
; OPT: %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
|
||||
; OPT: store <4 x i16> <i16 1, i16 2, i16 3, i16 4>, ptr addrspace(5) %alloca, align 2
|
||||
; OPT: %0 = load <4 x i16>, ptr addrspace(5) %alloca
|
||||
; OPT: %1 = extractelement <4 x i16> %0, i32 %sel2
|
||||
; OPT: store i16 %1, i16 addrspace(1)* %out, align 2
|
||||
; OPT: store i16 %1, ptr addrspace(1) %out, align 2
|
||||
|
||||
define amdgpu_kernel void @short4_alloca_store4(i16 addrspace(1)* %out, i16 addrspace(3)* %dummy_lds) {
|
||||
define amdgpu_kernel void @short4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
|
||||
entry:
|
||||
%alloca = alloca <4 x i16>, align 16, addrspace(5)
|
||||
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
@ -151,10 +151,10 @@ entry:
|
||||
%c2 = icmp uge i32 %y, 3
|
||||
%sel1 = select i1 %c1, i32 1, i32 2
|
||||
%sel2 = select i1 %c2, i32 0, i32 %sel1
|
||||
%gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
|
||||
store <4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> addrspace(5)* %alloca, align 2
|
||||
%load = load i16, i16 addrspace(5)* %gep, align 2
|
||||
store i16 %load, i16 addrspace(1)* %out, align 2
|
||||
%gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
|
||||
store <4 x i16> <i16 1, i16 2, i16 3, i16 4>, ptr addrspace(5) %alloca, align 2
|
||||
%load = load i16, ptr addrspace(5) %gep, align 2
|
||||
store i16 %load, ptr addrspace(1) %out, align 2
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -164,14 +164,14 @@ entry:
|
||||
; GCN-NOT: buffer_
|
||||
; GCN: s_mov_b64 s[{{[0-9:]+}}], 0xffff
|
||||
|
||||
; OPT: %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
|
||||
; OPT: %0 = load <4 x i16>, <4 x i16> addrspace(5)* %alloca
|
||||
; OPT: %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
|
||||
; OPT: %0 = load <4 x i16>, ptr addrspace(5) %alloca
|
||||
; OPT: %1 = insertelement <4 x i16> %0, i16 1, i32 %sel2
|
||||
; OPT: store <4 x i16> %1, <4 x i16> addrspace(5)* %alloca
|
||||
; OPT: %load = load <4 x i16>, <4 x i16> addrspace(5)* %alloca, align 2
|
||||
; OPT: store <4 x i16> %load, <4 x i16> addrspace(1)* %out, align 2
|
||||
; OPT: store <4 x i16> %1, ptr addrspace(5) %alloca
|
||||
; OPT: %load = load <4 x i16>, ptr addrspace(5) %alloca, align 2
|
||||
; OPT: store <4 x i16> %load, ptr addrspace(1) %out, align 2
|
||||
|
||||
define amdgpu_kernel void @short4_alloca_load4(<4 x i16> addrspace(1)* %out, i16 addrspace(3)* %dummy_lds) {
|
||||
define amdgpu_kernel void @short4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
|
||||
entry:
|
||||
%alloca = alloca <4 x i16>, align 16, addrspace(5)
|
||||
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
@ -180,10 +180,10 @@ entry:
|
||||
%c2 = icmp uge i32 %y, 3
|
||||
%sel1 = select i1 %c1, i32 1, i32 2
|
||||
%sel2 = select i1 %c2, i32 0, i32 %sel1
|
||||
%gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
|
||||
store i16 1, i16 addrspace(5)* %gep, align 4
|
||||
%load = load <4 x i16>, <4 x i16> addrspace(5)* %alloca, align 2
|
||||
store <4 x i16> %load, <4 x i16> addrspace(1)* %out, align 2
|
||||
%gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
|
||||
store i16 1, ptr addrspace(5) %gep, align 4
|
||||
%load = load <4 x i16>, ptr addrspace(5) %alloca, align 2
|
||||
store <4 x i16> %load, ptr addrspace(1) %out, align 2
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -194,14 +194,12 @@ entry:
|
||||
; GCN: v_mov_b32_e32 v1, 0
|
||||
|
||||
; OPT: %private_iptr = alloca <2 x i32>, align 8, addrspace(5)
|
||||
; OPT: %cast = bitcast <2 x i32> addrspace(5)* %private_iptr to i64 addrspace(5)*
|
||||
; OPT: %tmp1 = load i64, i64 addrspace(5)* %cast, align 8
|
||||
; OPT: %tmp1 = load i64, ptr addrspace(5) %private_iptr, align 8
|
||||
|
||||
define i64 @ptr_alloca_bitcast() {
|
||||
entry:
|
||||
%private_iptr = alloca <2 x i32>, align 8, addrspace(5)
|
||||
%cast = bitcast <2 x i32> addrspace(5)* %private_iptr to i64 addrspace(5)*
|
||||
%tmp1 = load i64, i64 addrspace(5)* %cast, align 8
|
||||
%tmp1 = load i64, ptr addrspace(5) %private_iptr, align 8
|
||||
ret i64 %tmp1
|
||||
}
|
||||
|
||||
|
||||
@ -2,26 +2,26 @@
|
||||
|
||||
; CHECK-LABEL: @volatile_load(
|
||||
; CHECK: alloca [4 x i32]
|
||||
; CHECK: load volatile i32, i32 addrspace(5)*
|
||||
define amdgpu_kernel void @volatile_load(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
|
||||
; CHECK: load volatile i32, ptr addrspace(5)
|
||||
define amdgpu_kernel void @volatile_load(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) {
|
||||
entry:
|
||||
%stack = alloca [4 x i32], align 4, addrspace(5)
|
||||
%tmp = load i32, i32 addrspace(1)* %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %tmp
|
||||
%load = load volatile i32, i32 addrspace(5)* %arrayidx1
|
||||
store i32 %load, i32 addrspace(1)* %out
|
||||
%tmp = load i32, ptr addrspace(1) %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp
|
||||
%load = load volatile i32, ptr addrspace(5) %arrayidx1
|
||||
store i32 %load, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @volatile_store(
|
||||
; CHECK: alloca [4 x i32]
|
||||
; CHECK: store volatile i32 %tmp, i32 addrspace(5)*
|
||||
define amdgpu_kernel void @volatile_store(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
|
||||
; CHECK: store volatile i32 %tmp, ptr addrspace(5)
|
||||
define amdgpu_kernel void @volatile_store(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) {
|
||||
entry:
|
||||
%stack = alloca [4 x i32], align 4, addrspace(5)
|
||||
%tmp = load i32, i32 addrspace(1)* %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %tmp
|
||||
store volatile i32 %tmp, i32 addrspace(5)* %arrayidx1
|
||||
%tmp = load i32, ptr addrspace(1) %in, align 4
|
||||
%arrayidx1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp
|
||||
store volatile i32 %tmp, ptr addrspace(5) %arrayidx1
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -30,15 +30,15 @@ entry:
|
||||
; CHECK: alloca double
|
||||
; CHECK: load double
|
||||
; CHECK: load volatile double
|
||||
define amdgpu_kernel void @volatile_and_non_volatile_load(double addrspace(1)* nocapture %arg, i32 %arg1) #0 {
|
||||
define amdgpu_kernel void @volatile_and_non_volatile_load(ptr addrspace(1) nocapture %arg, i32 %arg1) #0 {
|
||||
bb:
|
||||
%tmp = alloca double, align 8, addrspace(5)
|
||||
store double 0.000000e+00, double addrspace(5)* %tmp, align 8
|
||||
store double 0.000000e+00, ptr addrspace(5) %tmp, align 8
|
||||
|
||||
%tmp4 = load double, double addrspace(5)* %tmp, align 8
|
||||
%tmp5 = load volatile double, double addrspace(5)* %tmp, align 8
|
||||
%tmp4 = load double, ptr addrspace(5) %tmp, align 8
|
||||
%tmp5 = load volatile double, ptr addrspace(5) %tmp, align 8
|
||||
|
||||
store double %tmp4, double addrspace(1)* %arg
|
||||
store double %tmp4, ptr addrspace(1) %arg
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
@ -4,34 +4,34 @@
|
||||
|
||||
; CHECK-LABEL: @test_insertelement(
|
||||
; CHECK: %alloca = alloca i16
|
||||
; CHECK-NEXT: insertelement <2 x i16 addrspace(5)*> undef, i16 addrspace(5)* %alloca, i32 0
|
||||
; CHECK-NEXT: insertelement <2 x ptr addrspace(5)> undef, ptr addrspace(5) %alloca, i32 0
|
||||
define amdgpu_kernel void @test_insertelement() #0 {
|
||||
entry:
|
||||
%alloca = alloca i16, align 4, addrspace(5)
|
||||
%in = insertelement <2 x i16 addrspace(5)*> undef, i16 addrspace(5)* %alloca, i32 0
|
||||
store <2 x i16 addrspace(5)*> %in, <2 x i16 addrspace(5)*>* undef, align 4
|
||||
%in = insertelement <2 x ptr addrspace(5)> undef, ptr addrspace(5) %alloca, i32 0
|
||||
store <2 x ptr addrspace(5)> %in, ptr undef, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @test_insertvalue(
|
||||
; CHECK: %alloca = alloca i16
|
||||
; CHECK-NEXT: insertvalue { i16 addrspace(5)* } undef, i16 addrspace(5)* %alloca, 0
|
||||
; CHECK-NEXT: insertvalue { ptr addrspace(5) } undef, ptr addrspace(5) %alloca, 0
|
||||
define amdgpu_kernel void @test_insertvalue() #0 {
|
||||
entry:
|
||||
%alloca = alloca i16, align 4, addrspace(5)
|
||||
%in = insertvalue { i16 addrspace(5)* } undef, i16 addrspace(5)* %alloca, 0
|
||||
store { i16 addrspace(5)* } %in, { i16 addrspace(5)* }* undef, align 4
|
||||
%in = insertvalue { ptr addrspace(5) } undef, ptr addrspace(5) %alloca, 0
|
||||
store { ptr addrspace(5) } %in, ptr undef, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @test_insertvalue_array(
|
||||
; CHECK: %alloca = alloca i16
|
||||
; CHECK-NEXT: insertvalue [2 x i16 addrspace(5)*] undef, i16 addrspace(5)* %alloca, 0
|
||||
; CHECK-NEXT: insertvalue [2 x ptr addrspace(5)] undef, ptr addrspace(5) %alloca, 0
|
||||
define amdgpu_kernel void @test_insertvalue_array() #0 {
|
||||
entry:
|
||||
%alloca = alloca i16, align 4, addrspace(5)
|
||||
%in = insertvalue [2 x i16 addrspace(5)*] undef, i16 addrspace(5)* %alloca, 0
|
||||
store [2 x i16 addrspace(5)*] %in, [2 x i16 addrspace(5)*]* undef, align 4
|
||||
%in = insertvalue [2 x ptr addrspace(5)] undef, ptr addrspace(5) %alloca, 0
|
||||
store [2 x ptr addrspace(5)] %in, ptr undef, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
@ -5,61 +5,58 @@
|
||||
; OPT-LABEL: @vector_alloca_not_atomic(
|
||||
;
|
||||
; OPT: extractelement <3 x i32> <i32 0, i32 1, i32 2>, i64 %index
|
||||
define amdgpu_kernel void @vector_alloca_not_atomic(i32 addrspace(1)* %out, i64 %index) {
|
||||
define amdgpu_kernel void @vector_alloca_not_atomic(ptr addrspace(1) %out, i64 %index) {
|
||||
entry:
|
||||
%alloca = alloca [3 x i32], addrspace(5)
|
||||
%a0 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 0
|
||||
%a1 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 1
|
||||
%a2 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 2
|
||||
store i32 0, i32 addrspace(5)* %a0
|
||||
store i32 1, i32 addrspace(5)* %a1
|
||||
store i32 2, i32 addrspace(5)* %a2
|
||||
%tmp = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i64 0, i64 %index
|
||||
%data = load i32, i32 addrspace(5)* %tmp
|
||||
store i32 %data, i32 addrspace(1)* %out
|
||||
%a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
|
||||
%a2 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 2
|
||||
store i32 0, ptr addrspace(5) %alloca
|
||||
store i32 1, ptr addrspace(5) %a1
|
||||
store i32 2, ptr addrspace(5) %a2
|
||||
%tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index
|
||||
%data = load i32, ptr addrspace(5) %tmp
|
||||
store i32 %data, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; OPT-LABEL: @vector_alloca_atomic_read(
|
||||
;
|
||||
; OPT: alloca [3 x i32]
|
||||
; OPT: store i32 0, i32 addrspace(5)*
|
||||
; OPT: store i32 1, i32 addrspace(5)*
|
||||
; OPT: store i32 2, i32 addrspace(5)*
|
||||
; OPT: load atomic i32, i32 addrspace(5)*
|
||||
define amdgpu_kernel void @vector_alloca_atomic_read(i32 addrspace(1)* %out, i64 %index) {
|
||||
; OPT: store i32 0, ptr addrspace(5)
|
||||
; OPT: store i32 1, ptr addrspace(5)
|
||||
; OPT: store i32 2, ptr addrspace(5)
|
||||
; OPT: load atomic i32, ptr addrspace(5)
|
||||
define amdgpu_kernel void @vector_alloca_atomic_read(ptr addrspace(1) %out, i64 %index) {
|
||||
entry:
|
||||
%alloca = alloca [3 x i32], addrspace(5)
|
||||
%a0 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 0
|
||||
%a1 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 1
|
||||
%a2 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 2
|
||||
store i32 0, i32 addrspace(5)* %a0
|
||||
store i32 1, i32 addrspace(5)* %a1
|
||||
store i32 2, i32 addrspace(5)* %a2
|
||||
%tmp = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i64 0, i64 %index
|
||||
%data = load atomic i32, i32 addrspace(5)* %tmp acquire, align 4
|
||||
store i32 %data, i32 addrspace(1)* %out
|
||||
%a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
|
||||
%a2 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 2
|
||||
store i32 0, ptr addrspace(5) %alloca
|
||||
store i32 1, ptr addrspace(5) %a1
|
||||
store i32 2, ptr addrspace(5) %a2
|
||||
%tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index
|
||||
%data = load atomic i32, ptr addrspace(5) %tmp acquire, align 4
|
||||
store i32 %data, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; OPT-LABEL: @vector_alloca_atomic_write(
|
||||
;
|
||||
; OPT: alloca [3 x i32]
|
||||
; OPT: store atomic i32 0, i32 addrspace(5)
|
||||
; OPT: store atomic i32 1, i32 addrspace(5)
|
||||
; OPT: store atomic i32 2, i32 addrspace(5)
|
||||
; OPT: load i32, i32 addrspace(5)*
|
||||
define amdgpu_kernel void @vector_alloca_atomic_write(i32 addrspace(1)* %out, i64 %index) {
|
||||
; OPT: store atomic i32 0, ptr addrspace(5)
|
||||
; OPT: store atomic i32 1, ptr addrspace(5)
|
||||
; OPT: store atomic i32 2, ptr addrspace(5)
|
||||
; OPT: load i32, ptr addrspace(5)
|
||||
define amdgpu_kernel void @vector_alloca_atomic_write(ptr addrspace(1) %out, i64 %index) {
|
||||
entry:
|
||||
%alloca = alloca [3 x i32], addrspace(5)
|
||||
%a0 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 0
|
||||
%a1 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 1
|
||||
%a2 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 2
|
||||
store atomic i32 0, i32 addrspace(5)* %a0 release, align 4
|
||||
store atomic i32 1, i32 addrspace(5)* %a1 release, align 4
|
||||
store atomic i32 2, i32 addrspace(5)* %a2 release, align 4
|
||||
%tmp = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i64 0, i64 %index
|
||||
%data = load i32, i32 addrspace(5)* %tmp
|
||||
store i32 %data, i32 addrspace(1)* %out
|
||||
%a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
|
||||
%a2 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 2
|
||||
store atomic i32 0, ptr addrspace(5) %alloca release, align 4
|
||||
store atomic i32 1, ptr addrspace(5) %a1 release, align 4
|
||||
store atomic i32 2, ptr addrspace(5) %a2 release, align 4
|
||||
%tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index
|
||||
%data = load i32, ptr addrspace(5) %tmp
|
||||
store i32 %data, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -7,7 +7,7 @@ target datalayout = "A5"
|
||||
; OPT-LABEL: @vector_read_alloca_bitcast(
|
||||
; OPT-NOT: alloca
|
||||
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
|
||||
; OPT-NEXT: store i32 %0, i32 addrspace(1)* %out, align 4
|
||||
; OPT-NEXT: store i32 %0, ptr addrspace(1) %out, align 4
|
||||
|
||||
; GCN-LABEL: {{^}}vector_read_alloca_bitcast:
|
||||
; GCN-ALLOCA-COUNT-4: buffer_store_dword
|
||||
@ -24,20 +24,19 @@ target datalayout = "A5"
|
||||
; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND3:v[0-9]+]], 3, [[IND2]], vcc
|
||||
; GCN-PROMOTE: ScratchSize: 0
|
||||
|
||||
define amdgpu_kernel void @vector_read_alloca_bitcast(i32 addrspace(1)* %out, i32 %index) {
|
||||
define amdgpu_kernel void @vector_read_alloca_bitcast(ptr addrspace(1) %out, i32 %index) {
|
||||
entry:
|
||||
%tmp = alloca [4 x i32], addrspace(5)
|
||||
%x = bitcast [4 x i32] addrspace(5)* %tmp to i32 addrspace(5)*
|
||||
%y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
|
||||
%z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
|
||||
%w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
|
||||
store i32 0, i32 addrspace(5)* %x
|
||||
store i32 1, i32 addrspace(5)* %y
|
||||
store i32 2, i32 addrspace(5)* %z
|
||||
store i32 3, i32 addrspace(5)* %w
|
||||
%tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i32, i32 addrspace(5)* %tmp1
|
||||
store i32 %tmp2, i32 addrspace(1)* %out
|
||||
%y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
|
||||
%z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
|
||||
%w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
|
||||
store i32 0, ptr addrspace(5) %tmp
|
||||
store i32 1, ptr addrspace(5) %y
|
||||
store i32 2, ptr addrspace(5) %z
|
||||
store i32 3, ptr addrspace(5) %w
|
||||
%tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i32, ptr addrspace(5) %tmp1
|
||||
store i32 %tmp2, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -45,7 +44,7 @@ entry:
|
||||
; OPT-NOT: alloca
|
||||
; OPT: %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index
|
||||
; OPT-NEXT: %1 = extractelement <4 x i32> %0, i32 %r_index
|
||||
; OPT-NEXT: store i32 %1, i32 addrspace(1)* %out, align
|
||||
; OPT-NEXT: store i32 %1, ptr addrspace(1) %out, align
|
||||
|
||||
; GCN-LABEL: {{^}}vector_write_alloca_bitcast:
|
||||
; GCN-ALLOCA-COUNT-5: buffer_store_dword
|
||||
@ -55,22 +54,21 @@ entry:
|
||||
|
||||
; GCN-PROMOTE: ScratchSize: 0
|
||||
|
||||
define amdgpu_kernel void @vector_write_alloca_bitcast(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
|
||||
define amdgpu_kernel void @vector_write_alloca_bitcast(ptr addrspace(1) %out, i32 %w_index, i32 %r_index) {
|
||||
entry:
|
||||
%tmp = alloca [4 x i32], addrspace(5)
|
||||
%x = bitcast [4 x i32] addrspace(5)* %tmp to i32 addrspace(5)*
|
||||
%y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
|
||||
%z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
|
||||
%w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
|
||||
store i32 0, i32 addrspace(5)* %x
|
||||
store i32 0, i32 addrspace(5)* %y
|
||||
store i32 0, i32 addrspace(5)* %z
|
||||
store i32 0, i32 addrspace(5)* %w
|
||||
%tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %w_index
|
||||
store i32 1, i32 addrspace(5)* %tmp1
|
||||
%tmp2 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %r_index
|
||||
%tmp3 = load i32, i32 addrspace(5)* %tmp2
|
||||
store i32 %tmp3, i32 addrspace(1)* %out
|
||||
%y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
|
||||
%z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
|
||||
%w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
|
||||
store i32 0, ptr addrspace(5) %tmp
|
||||
store i32 0, ptr addrspace(5) %y
|
||||
store i32 0, ptr addrspace(5) %z
|
||||
store i32 0, ptr addrspace(5) %w
|
||||
%tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %w_index
|
||||
store i32 1, ptr addrspace(5) %tmp1
|
||||
%tmp2 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %r_index
|
||||
%tmp3 = load i32, ptr addrspace(5) %tmp2
|
||||
store i32 %tmp3, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -78,7 +76,7 @@ entry:
|
||||
; OPT-NOT: alloca
|
||||
; OPT: bb2:
|
||||
; OPT: %tmp.sroa.0.0 = phi <6 x float> [ undef, %bb ], [ %0, %bb2 ]
|
||||
; OPT: %0 = insertelement <6 x float> %tmp.sroa.0.0, float %tmp73, i32 %tmp10
|
||||
; OPT: %0 = insertelement <6 x float> %tmp.sroa.0.0, float %tmp72, i32 %tmp10
|
||||
; OPT: .preheader:
|
||||
; OPT: %bc = bitcast <6 x float> %0 to <6 x i32>
|
||||
; OPT: %1 = extractelement <6 x i32> %bc, i32 %tmp20
|
||||
@ -106,31 +104,28 @@ entry:
|
||||
|
||||
; GCN-PROMOTE: ScratchSize: 0
|
||||
|
||||
define amdgpu_kernel void @vector_write_read_bitcast_to_float(float addrspace(1)* %arg) {
|
||||
define amdgpu_kernel void @vector_write_read_bitcast_to_float(ptr addrspace(1) %arg) {
|
||||
bb:
|
||||
%tmp = alloca [6 x float], align 4, addrspace(5)
|
||||
%tmp1 = bitcast [6 x float] addrspace(5)* %tmp to i8 addrspace(5)*
|
||||
call void @llvm.lifetime.start.p5i8(i64 24, i8 addrspace(5)* %tmp1) #2
|
||||
call void @llvm.lifetime.start.p5(i64 24, ptr addrspace(5) %tmp) #2
|
||||
br label %bb2
|
||||
|
||||
bb2: ; preds = %bb2, %bb
|
||||
%tmp3 = phi i32 [ 0, %bb ], [ %tmp13, %bb2 ]
|
||||
%tmp4 = zext i32 %tmp3 to i64
|
||||
%tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp4
|
||||
%tmp6 = bitcast float addrspace(1)* %tmp5 to i32 addrspace(1)*
|
||||
%tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
|
||||
%tmp5 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %tmp4
|
||||
%tmp7 = load i32, ptr addrspace(1) %tmp5, align 4
|
||||
%tmp8 = trunc i32 %tmp3 to i16
|
||||
%tmp9 = urem i16 %tmp8, 6
|
||||
%tmp10 = zext i16 %tmp9 to i32
|
||||
%tmp11 = getelementptr inbounds [6 x float], [6 x float] addrspace(5)* %tmp, i32 0, i32 %tmp10
|
||||
%tmp12 = bitcast float addrspace(5)* %tmp11 to i32 addrspace(5)*
|
||||
store i32 %tmp7, i32 addrspace(5)* %tmp12, align 4
|
||||
%tmp11 = getelementptr inbounds [6 x float], ptr addrspace(5) %tmp, i32 0, i32 %tmp10
|
||||
store i32 %tmp7, ptr addrspace(5) %tmp11, align 4
|
||||
%tmp13 = add nuw nsw i32 %tmp3, 1
|
||||
%tmp14 = icmp eq i32 %tmp13, 1000
|
||||
br i1 %tmp14, label %.preheader, label %bb2
|
||||
|
||||
bb15: ; preds = %.preheader
|
||||
call void @llvm.lifetime.end.p5i8(i64 24, i8 addrspace(5)* %tmp1) #2
|
||||
call void @llvm.lifetime.end.p5(i64 24, ptr addrspace(5) %tmp) #2
|
||||
ret void
|
||||
|
||||
.preheader: ; preds = %.preheader, %bb2
|
||||
@ -139,13 +134,11 @@ bb15: ; preds = %.preheader
|
||||
%tmp18 = urem i16 %tmp17, 6
|
||||
%tmp19 = sub nuw nsw i16 5, %tmp18
|
||||
%tmp20 = zext i16 %tmp19 to i32
|
||||
%tmp21 = getelementptr inbounds [6 x float], [6 x float] addrspace(5)* %tmp, i32 0, i32 %tmp20
|
||||
%tmp22 = bitcast float addrspace(5)* %tmp21 to i32 addrspace(5)*
|
||||
%tmp23 = load i32, i32 addrspace(5)* %tmp22, align 4
|
||||
%tmp21 = getelementptr inbounds [6 x float], ptr addrspace(5) %tmp, i32 0, i32 %tmp20
|
||||
%tmp23 = load i32, ptr addrspace(5) %tmp21, align 4
|
||||
%tmp24 = zext i32 %tmp16 to i64
|
||||
%tmp25 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp24
|
||||
%tmp26 = bitcast float addrspace(1)* %tmp25 to i32 addrspace(1)*
|
||||
store i32 %tmp23, i32 addrspace(1)* %tmp26, align 4
|
||||
%tmp25 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %tmp24
|
||||
store i32 %tmp23, ptr addrspace(1) %tmp25, align 4
|
||||
%tmp27 = add nuw nsw i32 %tmp16, 1
|
||||
%tmp28 = icmp eq i32 %tmp27, 1000
|
||||
br i1 %tmp28, label %bb15, label %.preheader
|
||||
@ -155,7 +148,7 @@ bb15: ; preds = %.preheader
|
||||
; OPT-NOT: alloca
|
||||
; OPT: bb2:
|
||||
; OPT: %tmp.sroa.0.0 = phi <6 x double> [ undef, %bb ], [ %0, %bb2 ]
|
||||
; OPT: %0 = insertelement <6 x double> %tmp.sroa.0.0, double %tmp73, i32 %tmp10
|
||||
; OPT: %0 = insertelement <6 x double> %tmp.sroa.0.0, double %tmp72, i32 %tmp10
|
||||
; OPT: .preheader:
|
||||
; OPT: %bc = bitcast <6 x double> %0 to <6 x i64>
|
||||
; OPT: %1 = extractelement <6 x i64> %bc, i32 %tmp20
|
||||
@ -172,31 +165,28 @@ bb15: ; preds = %.preheader
|
||||
|
||||
; GCN-PROMOTE: ScratchSize: 0
|
||||
|
||||
define amdgpu_kernel void @vector_write_read_bitcast_to_double(double addrspace(1)* %arg) {
|
||||
define amdgpu_kernel void @vector_write_read_bitcast_to_double(ptr addrspace(1) %arg) {
|
||||
bb:
|
||||
%tmp = alloca [6 x double], align 8, addrspace(5)
|
||||
%tmp1 = bitcast [6 x double] addrspace(5)* %tmp to i8 addrspace(5)*
|
||||
call void @llvm.lifetime.start.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2
|
||||
call void @llvm.lifetime.start.p5(i64 48, ptr addrspace(5) %tmp) #2
|
||||
br label %bb2
|
||||
|
||||
bb2: ; preds = %bb2, %bb
|
||||
%tmp3 = phi i32 [ 0, %bb ], [ %tmp13, %bb2 ]
|
||||
%tmp4 = zext i32 %tmp3 to i64
|
||||
%tmp5 = getelementptr inbounds double, double addrspace(1)* %arg, i64 %tmp4
|
||||
%tmp6 = bitcast double addrspace(1)* %tmp5 to i64 addrspace(1)*
|
||||
%tmp7 = load i64, i64 addrspace(1)* %tmp6, align 8
|
||||
%tmp5 = getelementptr inbounds double, ptr addrspace(1) %arg, i64 %tmp4
|
||||
%tmp7 = load i64, ptr addrspace(1) %tmp5, align 8
|
||||
%tmp8 = trunc i32 %tmp3 to i16
|
||||
%tmp9 = urem i16 %tmp8, 6
|
||||
%tmp10 = zext i16 %tmp9 to i32
|
||||
%tmp11 = getelementptr inbounds [6 x double], [6 x double] addrspace(5)* %tmp, i32 0, i32 %tmp10
|
||||
%tmp12 = bitcast double addrspace(5)* %tmp11 to i64 addrspace(5)*
|
||||
store i64 %tmp7, i64 addrspace(5)* %tmp12, align 8
|
||||
%tmp11 = getelementptr inbounds [6 x double], ptr addrspace(5) %tmp, i32 0, i32 %tmp10
|
||||
store i64 %tmp7, ptr addrspace(5) %tmp11, align 8
|
||||
%tmp13 = add nuw nsw i32 %tmp3, 1
|
||||
%tmp14 = icmp eq i32 %tmp13, 1000
|
||||
br i1 %tmp14, label %.preheader, label %bb2
|
||||
|
||||
bb15: ; preds = %.preheader
|
||||
call void @llvm.lifetime.end.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2
|
||||
call void @llvm.lifetime.end.p5(i64 48, ptr addrspace(5) %tmp) #2
|
||||
ret void
|
||||
|
||||
.preheader: ; preds = %.preheader, %bb2
|
||||
@ -205,13 +195,11 @@ bb15: ; preds = %.preheader
|
||||
%tmp18 = urem i16 %tmp17, 6
|
||||
%tmp19 = sub nuw nsw i16 5, %tmp18
|
||||
%tmp20 = zext i16 %tmp19 to i32
|
||||
%tmp21 = getelementptr inbounds [6 x double], [6 x double] addrspace(5)* %tmp, i32 0, i32 %tmp20
|
||||
%tmp22 = bitcast double addrspace(5)* %tmp21 to i64 addrspace(5)*
|
||||
%tmp23 = load i64, i64 addrspace(5)* %tmp22, align 8
|
||||
%tmp21 = getelementptr inbounds [6 x double], ptr addrspace(5) %tmp, i32 0, i32 %tmp20
|
||||
%tmp23 = load i64, ptr addrspace(5) %tmp21, align 8
|
||||
%tmp24 = zext i32 %tmp16 to i64
|
||||
%tmp25 = getelementptr inbounds double, double addrspace(1)* %arg, i64 %tmp24
|
||||
%tmp26 = bitcast double addrspace(1)* %tmp25 to i64 addrspace(1)*
|
||||
store i64 %tmp23, i64 addrspace(1)* %tmp26, align 8
|
||||
%tmp25 = getelementptr inbounds double, ptr addrspace(1) %arg, i64 %tmp24
|
||||
store i64 %tmp23, ptr addrspace(1) %tmp25, align 8
|
||||
%tmp27 = add nuw nsw i32 %tmp16, 1
|
||||
%tmp28 = icmp eq i32 %tmp27, 1000
|
||||
br i1 %tmp28, label %bb15, label %.preheader
|
||||
@ -237,29 +225,28 @@ bb15: ; preds = %.preheader
|
||||
|
||||
; GCN-PROMOTE: ScratchSize: 0
|
||||
|
||||
define amdgpu_kernel void @vector_write_read_bitcast_to_i64(i64 addrspace(1)* %arg) {
|
||||
define amdgpu_kernel void @vector_write_read_bitcast_to_i64(ptr addrspace(1) %arg) {
|
||||
bb:
|
||||
%tmp = alloca [6 x i64], align 8, addrspace(5)
|
||||
%tmp1 = bitcast [6 x i64] addrspace(5)* %tmp to i8 addrspace(5)*
|
||||
call void @llvm.lifetime.start.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2
|
||||
call void @llvm.lifetime.start.p5(i64 48, ptr addrspace(5) %tmp) #2
|
||||
br label %bb2
|
||||
|
||||
bb2: ; preds = %bb2, %bb
|
||||
%tmp3 = phi i32 [ 0, %bb ], [ %tmp11, %bb2 ]
|
||||
%tmp4 = zext i32 %tmp3 to i64
|
||||
%tmp5 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp4
|
||||
%tmp6 = load i64, i64 addrspace(1)* %tmp5, align 8
|
||||
%tmp5 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 %tmp4
|
||||
%tmp6 = load i64, ptr addrspace(1) %tmp5, align 8
|
||||
%tmp7 = trunc i32 %tmp3 to i16
|
||||
%tmp8 = urem i16 %tmp7, 6
|
||||
%tmp9 = zext i16 %tmp8 to i32
|
||||
%tmp10 = getelementptr inbounds [6 x i64], [6 x i64] addrspace(5)* %tmp, i32 0, i32 %tmp9
|
||||
store i64 %tmp6, i64 addrspace(5)* %tmp10, align 8
|
||||
%tmp10 = getelementptr inbounds [6 x i64], ptr addrspace(5) %tmp, i32 0, i32 %tmp9
|
||||
store i64 %tmp6, ptr addrspace(5) %tmp10, align 8
|
||||
%tmp11 = add nuw nsw i32 %tmp3, 1
|
||||
%tmp12 = icmp eq i32 %tmp11, 1000
|
||||
br i1 %tmp12, label %.preheader, label %bb2
|
||||
|
||||
bb13: ; preds = %.preheader
|
||||
call void @llvm.lifetime.end.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2
|
||||
call void @llvm.lifetime.end.p5(i64 48, ptr addrspace(5) %tmp) #2
|
||||
ret void
|
||||
|
||||
.preheader: ; preds = %.preheader, %bb2
|
||||
@ -268,11 +255,11 @@ bb13: ; preds = %.preheader
|
||||
%tmp16 = urem i16 %tmp15, 6
|
||||
%tmp17 = sub nuw nsw i16 5, %tmp16
|
||||
%tmp18 = zext i16 %tmp17 to i32
|
||||
%tmp19 = getelementptr inbounds [6 x i64], [6 x i64] addrspace(5)* %tmp, i32 0, i32 %tmp18
|
||||
%tmp20 = load i64, i64 addrspace(5)* %tmp19, align 8
|
||||
%tmp19 = getelementptr inbounds [6 x i64], ptr addrspace(5) %tmp, i32 0, i32 %tmp18
|
||||
%tmp20 = load i64, ptr addrspace(5) %tmp19, align 8
|
||||
%tmp21 = zext i32 %tmp14 to i64
|
||||
%tmp22 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp21
|
||||
store i64 %tmp20, i64 addrspace(1)* %tmp22, align 8
|
||||
%tmp22 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 %tmp21
|
||||
store i64 %tmp20, ptr addrspace(1) %tmp22, align 8
|
||||
%tmp23 = add nuw nsw i32 %tmp14, 1
|
||||
%tmp24 = icmp eq i32 %tmp23, 1000
|
||||
br i1 %tmp24, label %bb13, label %.preheader
|
||||
@ -282,27 +269,26 @@ bb13: ; preds = %.preheader
|
||||
|
||||
; OPT-LABEL: @vector_read_alloca_bitcast_assume(
|
||||
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
|
||||
; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
|
||||
; OPT: store i32 %0, ptr addrspace(1) %out, align 4
|
||||
|
||||
; GCN-LABEL: {{^}}vector_read_alloca_bitcast_assume:
|
||||
; GCN-COUNT-4: buffer_store_dword
|
||||
|
||||
define amdgpu_kernel void @vector_read_alloca_bitcast_assume(i32 addrspace(1)* %out, i32 %index) {
|
||||
define amdgpu_kernel void @vector_read_alloca_bitcast_assume(ptr addrspace(1) %out, i32 %index) {
|
||||
entry:
|
||||
%tmp = alloca [4 x i32], addrspace(5)
|
||||
%x = bitcast [4 x i32] addrspace(5)* %tmp to i32 addrspace(5)*
|
||||
%cmp = icmp ne i32 addrspace(5)* %x, null
|
||||
%cmp = icmp ne ptr addrspace(5) %tmp, null
|
||||
call void @llvm.assume(i1 %cmp)
|
||||
%y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
|
||||
%z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
|
||||
%w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
|
||||
store i32 0, i32 addrspace(5)* %x
|
||||
store i32 1, i32 addrspace(5)* %y
|
||||
store i32 2, i32 addrspace(5)* %z
|
||||
store i32 3, i32 addrspace(5)* %w
|
||||
%tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i32, i32 addrspace(5)* %tmp1
|
||||
store i32 %tmp2, i32 addrspace(1)* %out
|
||||
%y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
|
||||
%z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
|
||||
%w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
|
||||
store i32 0, ptr addrspace(5) %tmp
|
||||
store i32 1, ptr addrspace(5) %y
|
||||
store i32 2, ptr addrspace(5) %z
|
||||
store i32 3, ptr addrspace(5) %w
|
||||
%tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i32, ptr addrspace(5) %tmp1
|
||||
store i32 %tmp2, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -310,7 +296,7 @@ entry:
|
||||
; OPT-NOT: alloca
|
||||
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
|
||||
; OPT-NEXT: %add2 = add nuw nsw i32 %0, 1
|
||||
; OPT-NEXT: store i32 %add2, i32 addrspace(1)* %out, align 4
|
||||
; OPT-NEXT: store i32 %add2, ptr addrspace(1) %out, align 4
|
||||
|
||||
; GCN-LABEL: {{^}}vector_read_alloca_multiuse:
|
||||
; GCN-ALLOCA-COUNT-4: buffer_store_dword
|
||||
@ -328,31 +314,29 @@ entry:
|
||||
|
||||
; GCN-PROMOTE: ScratchSize: 0
|
||||
|
||||
define amdgpu_kernel void @vector_read_alloca_multiuse(i32 addrspace(1)* %out, i32 %index) {
|
||||
define amdgpu_kernel void @vector_read_alloca_multiuse(ptr addrspace(1) %out, i32 %index) {
|
||||
entry:
|
||||
%tmp = alloca [4 x i32], addrspace(5)
|
||||
%b = bitcast [4 x i32] addrspace(5)* %tmp to float addrspace(5)*
|
||||
%x = bitcast float addrspace(5)* %b to i32 addrspace(5)*
|
||||
%y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
|
||||
%z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
|
||||
%w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
|
||||
store i32 0, i32 addrspace(5)* %x
|
||||
store i32 1, i32 addrspace(5)* %y
|
||||
store i32 2, i32 addrspace(5)* %z
|
||||
store i32 3, i32 addrspace(5)* %w
|
||||
%tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i32, i32 addrspace(5)* %tmp1
|
||||
%tmp3 = load i32, i32 addrspace(5)* %x
|
||||
%tmp4 = load i32, i32 addrspace(5)* %y
|
||||
%y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
|
||||
%z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
|
||||
%w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
|
||||
store i32 0, ptr addrspace(5) %tmp
|
||||
store i32 1, ptr addrspace(5) %y
|
||||
store i32 2, ptr addrspace(5) %z
|
||||
store i32 3, ptr addrspace(5) %w
|
||||
%tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i32, ptr addrspace(5) %tmp1
|
||||
%tmp3 = load i32, ptr addrspace(5) %tmp
|
||||
%tmp4 = load i32, ptr addrspace(5) %y
|
||||
%add1 = add i32 %tmp2, %tmp3
|
||||
%add2 = add i32 %add1, %tmp4
|
||||
store i32 %add2, i32 addrspace(1)* %out
|
||||
store i32 %add2, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; OPT-LABEL: @bitcast_vector_to_vector(
|
||||
; OPT-NOT: alloca
|
||||
; OPT: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(1)* %out, align 16
|
||||
; OPT: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(1) %out, align 16
|
||||
|
||||
; GCN-LABEL: {{^}}bitcast_vector_to_vector:
|
||||
; GCN: v_mov_b32_e32 v0, 1
|
||||
@ -362,19 +346,18 @@ entry:
|
||||
|
||||
; GCN: ScratchSize: 0
|
||||
|
||||
define amdgpu_kernel void @bitcast_vector_to_vector(<4 x i32> addrspace(1)* %out) {
|
||||
define amdgpu_kernel void @bitcast_vector_to_vector(ptr addrspace(1) %out) {
|
||||
.entry:
|
||||
%alloca = alloca <4 x float>, align 16, addrspace(5)
|
||||
%cast = bitcast <4 x float> addrspace(5)* %alloca to <4 x i32> addrspace(5)*
|
||||
store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %cast
|
||||
%load = load <4 x i32>, <4 x i32> addrspace(5)* %cast, align 16
|
||||
store <4 x i32> %load, <4 x i32> addrspace(1)* %out
|
||||
store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %alloca
|
||||
%load = load <4 x i32>, ptr addrspace(5) %alloca, align 16
|
||||
store <4 x i32> %load, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; OPT-LABEL: @vector_bitcast_from_alloca_array(
|
||||
; OPT-NOT: alloca
|
||||
; OPT: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(1)* %out, align 16
|
||||
; OPT: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(1) %out, align 16
|
||||
|
||||
; GCN-LABEL: {{^}}vector_bitcast_from_alloca_array:
|
||||
; GCN: v_mov_b32_e32 v0, 1
|
||||
@ -384,26 +367,24 @@ define amdgpu_kernel void @bitcast_vector_to_vector(<4 x i32> addrspace(1)* %out
|
||||
|
||||
; GCN: ScratchSize: 0
|
||||
|
||||
define amdgpu_kernel void @vector_bitcast_from_alloca_array(<4 x i32> addrspace(1)* %out) {
|
||||
define amdgpu_kernel void @vector_bitcast_from_alloca_array(ptr addrspace(1) %out) {
|
||||
.entry:
|
||||
%alloca = alloca [4 x float], align 16, addrspace(5)
|
||||
%cast = bitcast [4 x float] addrspace(5)* %alloca to <4 x i32> addrspace(5)*
|
||||
store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %cast
|
||||
%load = load <4 x i32>, <4 x i32> addrspace(5)* %cast, align 16
|
||||
store <4 x i32> %load, <4 x i32> addrspace(1)* %out
|
||||
store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %alloca
|
||||
%load = load <4 x i32>, ptr addrspace(5) %alloca, align 16
|
||||
store <4 x i32> %load, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; OPT-LABEL: @vector_bitcast_to_array_from_alloca_array(
|
||||
; OPT-NOT: alloca
|
||||
; OPT: %out.repack = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 0
|
||||
; OPT-NEXT: store i32 1, i32 addrspace(1)* %out.repack, align 4
|
||||
; OPT-NEXT: %out.repack1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 1
|
||||
; OPT-NEXT: store i32 2, i32 addrspace(1)* %out.repack1, align 4
|
||||
; OPT-NEXT: %out.repack2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 2
|
||||
; OPT-NEXT: store i32 3, i32 addrspace(1)* %out.repack2, align 4
|
||||
; OPT-NEXT: %out.repack3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 3
|
||||
; OPT-NEXT: store i32 4, i32 addrspace(1)* %out.repack3, align 4
|
||||
; OPT-NEXT: store i32 1, ptr addrspace(1) %out, align 4
|
||||
; OPT-NEXT: %out.repack1 = getelementptr inbounds [4 x i32], ptr addrspace(1) %out, i64 0, i64 1
|
||||
; OPT-NEXT: store i32 2, ptr addrspace(1) %out.repack1, align 4
|
||||
; OPT-NEXT: %out.repack2 = getelementptr inbounds [4 x i32], ptr addrspace(1) %out, i64 0, i64 2
|
||||
; OPT-NEXT: store i32 3, ptr addrspace(1) %out.repack2, align 4
|
||||
; OPT-NEXT: %out.repack3 = getelementptr inbounds [4 x i32], ptr addrspace(1) %out, i64 0, i64 3
|
||||
; OPT-NEXT: store i32 4, ptr addrspace(1) %out.repack3, align 4
|
||||
|
||||
; GCN-LABEL: {{^}}vector_bitcast_to_array_from_alloca_array:
|
||||
; GCN: v_mov_b32_e32 v0, 1
|
||||
@ -413,26 +394,23 @@ define amdgpu_kernel void @vector_bitcast_from_alloca_array(<4 x i32> addrspace(
|
||||
|
||||
; GCN: ScratchSize: 0
|
||||
|
||||
define amdgpu_kernel void @vector_bitcast_to_array_from_alloca_array([4 x i32] addrspace(1)* %out) {
|
||||
.entry:
|
||||
define amdgpu_kernel void @vector_bitcast_to_array_from_alloca_array(ptr addrspace(1) %out) {
|
||||
%alloca = alloca [4 x float], align 16, addrspace(5)
|
||||
%cast = bitcast [4 x float] addrspace(5)* %alloca to [4 x i32] addrspace(5)*
|
||||
store [4 x i32] [i32 1, i32 2, i32 3, i32 4], [4 x i32] addrspace(5)* %cast
|
||||
%load = load [4 x i32], [4 x i32] addrspace(5)* %cast, align 16
|
||||
store [4 x i32] %load, [4 x i32] addrspace(1)* %out
|
||||
store [4 x i32] [i32 1, i32 2, i32 3, i32 4], ptr addrspace(5) %alloca
|
||||
%load = load [4 x i32], ptr addrspace(5) %alloca, align 16
|
||||
store [4 x i32] %load, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; OPT-LABEL: @vector_bitcast_to_struct_from_alloca_array(
|
||||
; OPT-NOT: alloca
|
||||
; OPT: %out.repack = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 0
|
||||
; OPT-NEXT: store i32 1, i32 addrspace(1)* %out.repack, align 4
|
||||
; OPT-NEXT: %out.repack1 = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 1
|
||||
; OPT-NEXT: store i32 2, i32 addrspace(1)* %out.repack1, align 4
|
||||
; OPT-NEXT: %out.repack2 = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 2
|
||||
; OPT-NEXT: store i32 3, i32 addrspace(1)* %out.repack2, align 4
|
||||
; OPT-NEXT: %out.repack3 = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 3
|
||||
; OPT-NEXT: store i32 4, i32 addrspace(1)* %out.repack3, align 4
|
||||
; OPT-NEXT: store i32 1, ptr addrspace(1) %out, align 4
|
||||
; OPT-NEXT: %out.repack1 = getelementptr inbounds %struct.v4, ptr addrspace(1) %out, i64 0, i32 1
|
||||
; OPT-NEXT: store i32 2, ptr addrspace(1) %out.repack1, align 4
|
||||
; OPT-NEXT: %out.repack2 = getelementptr inbounds %struct.v4, ptr addrspace(1) %out, i64 0, i32 2
|
||||
; OPT-NEXT: store i32 3, ptr addrspace(1) %out.repack2, align 4
|
||||
; OPT-NEXT: %out.repack3 = getelementptr inbounds %struct.v4, ptr addrspace(1) %out, i64 0, i32 3
|
||||
; OPT-NEXT: store i32 4, ptr addrspace(1) %out.repack3, align 4
|
||||
|
||||
; GCN-LABEL: {{^}}vector_bitcast_to_struct_from_alloca_array:
|
||||
; GCN: v_mov_b32_e32 v0, 1
|
||||
@ -444,18 +422,16 @@ define amdgpu_kernel void @vector_bitcast_to_array_from_alloca_array([4 x i32] a
|
||||
|
||||
%struct.v4 = type { i32, i32, i32, i32 }
|
||||
|
||||
define amdgpu_kernel void @vector_bitcast_to_struct_from_alloca_array(%struct.v4 addrspace(1)* %out) {
|
||||
.entry:
|
||||
define amdgpu_kernel void @vector_bitcast_to_struct_from_alloca_array(ptr addrspace(1) %out) {
|
||||
%alloca = alloca [4 x float], align 16, addrspace(5)
|
||||
%cast = bitcast [4 x float] addrspace(5)* %alloca to %struct.v4 addrspace(5)*
|
||||
store %struct.v4 { i32 1, i32 2, i32 3, i32 4 }, %struct.v4 addrspace(5)* %cast
|
||||
%load = load %struct.v4, %struct.v4 addrspace(5)* %cast, align 16
|
||||
store %struct.v4 %load, %struct.v4 addrspace(1)* %out
|
||||
store %struct.v4 { i32 1, i32 2, i32 3, i32 4 }, ptr addrspace(5) %alloca
|
||||
%load = load %struct.v4, ptr addrspace(5) %alloca, align 16
|
||||
store %struct.v4 %load, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.lifetime.start.p5i8(i64 immarg, i8 addrspace(5)* nocapture)
|
||||
declare void @llvm.lifetime.start.p5(i64 immarg, ptr addrspace(5) nocapture)
|
||||
|
||||
declare void @llvm.lifetime.end.p5i8(i64 immarg, i8 addrspace(5)* nocapture)
|
||||
declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) nocapture)
|
||||
|
||||
declare void @llvm.assume(i1)
|
||||
|
||||
@ -8,14 +8,13 @@ target datalayout = "A5"
|
||||
; OPT: <8 x i64>
|
||||
; LIMIT32: alloca
|
||||
; LIMIT32-NOT: <8 x i64>
|
||||
define amdgpu_kernel void @alloca_8xi64_max1024(i64 addrspace(1)* %out, i32 %index) #0 {
|
||||
define amdgpu_kernel void @alloca_8xi64_max1024(ptr addrspace(1) %out, i32 %index) #0 {
|
||||
entry:
|
||||
%tmp = alloca [8 x i64], addrspace(5)
|
||||
%x = getelementptr [8 x i64], [8 x i64] addrspace(5)* %tmp, i32 0, i32 0
|
||||
store i64 0, i64 addrspace(5)* %x
|
||||
%tmp1 = getelementptr [8 x i64], [8 x i64] addrspace(5)* %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i64, i64 addrspace(5)* %tmp1
|
||||
store i64 %tmp2, i64 addrspace(1)* %out
|
||||
store i64 0, ptr addrspace(5) %tmp
|
||||
%tmp1 = getelementptr [8 x i64], ptr addrspace(5) %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i64, ptr addrspace(5) %tmp1
|
||||
store i64 %tmp2, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -24,14 +23,13 @@ entry:
|
||||
; OPT-NOT: <9 x i64>
|
||||
; LIMIT32: alloca
|
||||
; LIMIT32-NOT: <9 x i64>
|
||||
define amdgpu_kernel void @alloca_9xi64_max1024(i64 addrspace(1)* %out, i32 %index) #0 {
|
||||
define amdgpu_kernel void @alloca_9xi64_max1024(ptr addrspace(1) %out, i32 %index) #0 {
|
||||
entry:
|
||||
%tmp = alloca [9 x i64], addrspace(5)
|
||||
%x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0
|
||||
store i64 0, i64 addrspace(5)* %x
|
||||
%tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i64, i64 addrspace(5)* %tmp1
|
||||
store i64 %tmp2, i64 addrspace(1)* %out
|
||||
store i64 0, ptr addrspace(5) %tmp
|
||||
%tmp1 = getelementptr [9 x i64], ptr addrspace(5) %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i64, ptr addrspace(5) %tmp1
|
||||
store i64 %tmp2, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -40,14 +38,13 @@ entry:
|
||||
; OPT: <16 x i64>
|
||||
; LIMIT32: alloca
|
||||
; LIMIT32-NOT: <16 x i64>
|
||||
define amdgpu_kernel void @alloca_16xi64_max512(i64 addrspace(1)* %out, i32 %index) #1 {
|
||||
define amdgpu_kernel void @alloca_16xi64_max512(ptr addrspace(1) %out, i32 %index) #1 {
|
||||
entry:
|
||||
%tmp = alloca [16 x i64], addrspace(5)
|
||||
%x = getelementptr [16 x i64], [16 x i64] addrspace(5)* %tmp, i32 0, i32 0
|
||||
store i64 0, i64 addrspace(5)* %x
|
||||
%tmp1 = getelementptr [16 x i64], [16 x i64] addrspace(5)* %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i64, i64 addrspace(5)* %tmp1
|
||||
store i64 %tmp2, i64 addrspace(1)* %out
|
||||
store i64 0, ptr addrspace(5) %tmp
|
||||
%tmp1 = getelementptr [16 x i64], ptr addrspace(5) %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i64, ptr addrspace(5) %tmp1
|
||||
store i64 %tmp2, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -56,14 +53,13 @@ entry:
|
||||
; OPT-NOT: <17 x i64>
|
||||
; LIMIT32: alloca
|
||||
; LIMIT32-NOT: <17 x i64>
|
||||
define amdgpu_kernel void @alloca_17xi64_max512(i64 addrspace(1)* %out, i32 %index) #1 {
|
||||
define amdgpu_kernel void @alloca_17xi64_max512(ptr addrspace(1) %out, i32 %index) #1 {
|
||||
entry:
|
||||
%tmp = alloca [17 x i64], addrspace(5)
|
||||
%x = getelementptr [17 x i64], [17 x i64] addrspace(5)* %tmp, i32 0, i32 0
|
||||
store i64 0, i64 addrspace(5)* %x
|
||||
%tmp1 = getelementptr [17 x i64], [17 x i64] addrspace(5)* %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i64, i64 addrspace(5)* %tmp1
|
||||
store i64 %tmp2, i64 addrspace(1)* %out
|
||||
store i64 0, ptr addrspace(5) %tmp
|
||||
%tmp1 = getelementptr [17 x i64], ptr addrspace(5) %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i64, ptr addrspace(5) %tmp1
|
||||
store i64 %tmp2, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -72,14 +68,13 @@ entry:
|
||||
; OPT-NOT: <9 x i128>
|
||||
; LIMIT32: alloca
|
||||
; LIMIT32-NOT: <9 x i128>
|
||||
define amdgpu_kernel void @alloca_9xi128_max512(i128 addrspace(1)* %out, i32 %index) #1 {
|
||||
define amdgpu_kernel void @alloca_9xi128_max512(ptr addrspace(1) %out, i32 %index) #1 {
|
||||
entry:
|
||||
%tmp = alloca [9 x i128], addrspace(5)
|
||||
%x = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 0
|
||||
store i128 0, i128 addrspace(5)* %x
|
||||
%tmp1 = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i128, i128 addrspace(5)* %tmp1
|
||||
store i128 %tmp2, i128 addrspace(1)* %out
|
||||
store i128 0, ptr addrspace(5) %tmp
|
||||
%tmp1 = getelementptr [9 x i128], ptr addrspace(5) %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i128, ptr addrspace(5) %tmp1
|
||||
store i128 %tmp2, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -88,14 +83,13 @@ entry:
|
||||
; OPT: <9 x i128>
|
||||
; LIMIT32: alloca
|
||||
; LIMIT32-NOT: <9 x i128>
|
||||
define amdgpu_kernel void @alloca_9xi128_max256(i128 addrspace(1)* %out, i32 %index) #2 {
|
||||
define amdgpu_kernel void @alloca_9xi128_max256(ptr addrspace(1) %out, i32 %index) #2 {
|
||||
entry:
|
||||
%tmp = alloca [9 x i128], addrspace(5)
|
||||
%x = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 0
|
||||
store i128 0, i128 addrspace(5)* %x
|
||||
%tmp1 = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i128, i128 addrspace(5)* %tmp1
|
||||
store i128 %tmp2, i128 addrspace(1)* %out
|
||||
store i128 0, ptr addrspace(5) %tmp
|
||||
%tmp1 = getelementptr [9 x i128], ptr addrspace(5) %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i128, ptr addrspace(5) %tmp1
|
||||
store i128 %tmp2, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -104,14 +98,13 @@ entry:
|
||||
; OPT: <16 x i128>
|
||||
; LIMIT32: alloca
|
||||
; LIMIT32-NOT: <16 x i128>
|
||||
define amdgpu_kernel void @alloca_16xi128_max256(i128 addrspace(1)* %out, i32 %index) #2 {
|
||||
define amdgpu_kernel void @alloca_16xi128_max256(ptr addrspace(1) %out, i32 %index) #2 {
|
||||
entry:
|
||||
%tmp = alloca [16 x i128], addrspace(5)
|
||||
%x = getelementptr [16 x i128], [16 x i128] addrspace(5)* %tmp, i32 0, i32 0
|
||||
store i128 0, i128 addrspace(5)* %x
|
||||
%tmp1 = getelementptr [16 x i128], [16 x i128] addrspace(5)* %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i128, i128 addrspace(5)* %tmp1
|
||||
store i128 %tmp2, i128 addrspace(1)* %out
|
||||
store i128 0, ptr addrspace(5) %tmp
|
||||
%tmp1 = getelementptr [16 x i128], ptr addrspace(5) %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i128, ptr addrspace(5) %tmp1
|
||||
store i128 %tmp2, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -120,14 +113,13 @@ entry:
|
||||
; OPT-NOT: <9 x i256>
|
||||
; LIMIT32: alloca
|
||||
; LIMIT32-NOT: <9 x i256>
|
||||
define amdgpu_kernel void @alloca_9xi256_max256(i256 addrspace(1)* %out, i32 %index) #2 {
|
||||
define amdgpu_kernel void @alloca_9xi256_max256(ptr addrspace(1) %out, i32 %index) #2 {
|
||||
entry:
|
||||
%tmp = alloca [9 x i256], addrspace(5)
|
||||
%x = getelementptr [9 x i256], [9 x i256] addrspace(5)* %tmp, i32 0, i32 0
|
||||
store i256 0, i256 addrspace(5)* %x
|
||||
%tmp1 = getelementptr [9 x i256], [9 x i256] addrspace(5)* %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i256, i256 addrspace(5)* %tmp1
|
||||
store i256 %tmp2, i256 addrspace(1)* %out
|
||||
store i256 0, ptr addrspace(5) %tmp
|
||||
%tmp1 = getelementptr [9 x i256], ptr addrspace(5) %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i256, ptr addrspace(5) %tmp1
|
||||
store i256 %tmp2, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -136,14 +128,13 @@ entry:
|
||||
; OPT: <9 x i64>
|
||||
; LIMIT32: alloca
|
||||
; LIMIT32-NOT: <9 x i64>
|
||||
define amdgpu_kernel void @alloca_9xi64_max256(i64 addrspace(1)* %out, i32 %index) #2 {
|
||||
define amdgpu_kernel void @alloca_9xi64_max256(ptr addrspace(1) %out, i32 %index) #2 {
|
||||
entry:
|
||||
%tmp = alloca [9 x i64], addrspace(5)
|
||||
%x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0
|
||||
store i64 0, i64 addrspace(5)* %x
|
||||
%tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i64, i64 addrspace(5)* %tmp1
|
||||
store i64 %tmp2, i64 addrspace(1)* %out
|
||||
store i64 0, ptr addrspace(5) %tmp
|
||||
%tmp1 = getelementptr [9 x i64], ptr addrspace(5) %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i64, ptr addrspace(5) %tmp1
|
||||
store i64 %tmp2, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -152,14 +143,13 @@ entry:
|
||||
; OPT-NOT: <9 x i64>
|
||||
; LIMIT32: alloca
|
||||
; LIMIT32-NOT: <9 x i64>
|
||||
define void @func_alloca_9xi64_max256(i64 addrspace(1)* %out, i32 %index) #2 {
|
||||
define void @func_alloca_9xi64_max256(ptr addrspace(1) %out, i32 %index) #2 {
|
||||
entry:
|
||||
%tmp = alloca [9 x i64], addrspace(5)
|
||||
%x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0
|
||||
store i64 0, i64 addrspace(5)* %x
|
||||
%tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i64, i64 addrspace(5)* %tmp1
|
||||
store i64 %tmp2, i64 addrspace(1)* %out
|
||||
store i64 0, ptr addrspace(5) %tmp
|
||||
%tmp1 = getelementptr [9 x i64], ptr addrspace(5) %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i64, ptr addrspace(5) %tmp1
|
||||
store i64 %tmp2, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
@ -9,7 +9,7 @@ target datalayout = "A5"
|
||||
|
||||
; OPT-LABEL: @vector_read(
|
||||
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
|
||||
; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
|
||||
; OPT: store i32 %0, ptr addrspace(1) %out, align 4
|
||||
|
||||
; FUNC-LABEL: {{^}}vector_read:
|
||||
; EG: MOV
|
||||
@ -17,27 +17,26 @@ target datalayout = "A5"
|
||||
; EG: MOV
|
||||
; EG: MOV
|
||||
; EG: MOVA_INT
|
||||
define amdgpu_kernel void @vector_read(i32 addrspace(1)* %out, i32 %index) {
|
||||
define amdgpu_kernel void @vector_read(ptr addrspace(1) %out, i32 %index) {
|
||||
entry:
|
||||
%tmp = alloca [4 x i32], addrspace(5)
|
||||
%x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
|
||||
%y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
|
||||
%z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
|
||||
%w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
|
||||
store i32 0, i32 addrspace(5)* %x
|
||||
store i32 1, i32 addrspace(5)* %y
|
||||
store i32 2, i32 addrspace(5)* %z
|
||||
store i32 3, i32 addrspace(5)* %w
|
||||
%tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i32, i32 addrspace(5)* %tmp1
|
||||
store i32 %tmp2, i32 addrspace(1)* %out
|
||||
%y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
|
||||
%z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
|
||||
%w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
|
||||
store i32 0, ptr addrspace(5) %tmp
|
||||
store i32 1, ptr addrspace(5) %y
|
||||
store i32 2, ptr addrspace(5) %z
|
||||
store i32 3, ptr addrspace(5) %w
|
||||
%tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i32, ptr addrspace(5) %tmp1
|
||||
store i32 %tmp2, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; OPT-LABEL: @vector_write(
|
||||
; OPT: %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index
|
||||
; OPT: %1 = extractelement <4 x i32> %0, i32 %r_index
|
||||
; OPT: store i32 %1, i32 addrspace(1)* %out, align 4
|
||||
; OPT: store i32 %1, ptr addrspace(1) %out, align 4
|
||||
|
||||
; FUNC-LABEL: {{^}}vector_write:
|
||||
; EG: MOV
|
||||
@ -46,91 +45,83 @@ entry:
|
||||
; EG: MOV
|
||||
; EG: MOVA_INT
|
||||
; EG: MOVA_INT
|
||||
define amdgpu_kernel void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
|
||||
define amdgpu_kernel void @vector_write(ptr addrspace(1) %out, i32 %w_index, i32 %r_index) {
|
||||
entry:
|
||||
%tmp = alloca [4 x i32], addrspace(5)
|
||||
%x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
|
||||
%y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
|
||||
%z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
|
||||
%w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
|
||||
store i32 0, i32 addrspace(5)* %x
|
||||
store i32 0, i32 addrspace(5)* %y
|
||||
store i32 0, i32 addrspace(5)* %z
|
||||
store i32 0, i32 addrspace(5)* %w
|
||||
%tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %w_index
|
||||
store i32 1, i32 addrspace(5)* %tmp1
|
||||
%tmp2 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %r_index
|
||||
%tmp3 = load i32, i32 addrspace(5)* %tmp2
|
||||
store i32 %tmp3, i32 addrspace(1)* %out
|
||||
%y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
|
||||
%z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
|
||||
%w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
|
||||
store i32 0, ptr addrspace(5) %tmp
|
||||
store i32 0, ptr addrspace(5) %y
|
||||
store i32 0, ptr addrspace(5) %z
|
||||
store i32 0, ptr addrspace(5) %w
|
||||
%tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %w_index
|
||||
store i32 1, ptr addrspace(5) %tmp1
|
||||
%tmp2 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %r_index
|
||||
%tmp3 = load i32, ptr addrspace(5) %tmp2
|
||||
store i32 %tmp3, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; This test should be optimized to:
|
||||
; store i32 0, i32 addrspace(1)* %out
|
||||
; store i32 0, ptr addrspace(1) %out
|
||||
|
||||
; OPT-LABEL: @bitcast_gep(
|
||||
; OPT-LABEL: store i32 0, i32 addrspace(1)* %out, align 4
|
||||
; OPT-LABEL: store i32 0, ptr addrspace(1) %out, align 4
|
||||
|
||||
; FUNC-LABEL: {{^}}bitcast_gep:
|
||||
; EG: STORE_RAW
|
||||
define amdgpu_kernel void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
|
||||
define amdgpu_kernel void @bitcast_gep(ptr addrspace(1) %out, i32 %w_index, i32 %r_index) {
|
||||
entry:
|
||||
%tmp = alloca [4 x i32], addrspace(5)
|
||||
%x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
|
||||
%y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
|
||||
%z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
|
||||
%w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
|
||||
store i32 0, i32 addrspace(5)* %x
|
||||
store i32 0, i32 addrspace(5)* %y
|
||||
store i32 0, i32 addrspace(5)* %z
|
||||
store i32 0, i32 addrspace(5)* %w
|
||||
%tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
|
||||
%tmp2 = bitcast i32 addrspace(5)* %tmp1 to [4 x i32] addrspace(5)*
|
||||
%tmp3 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp2, i32 0, i32 0
|
||||
%tmp4 = load i32, i32 addrspace(5)* %tmp3
|
||||
store i32 %tmp4, i32 addrspace(1)* %out
|
||||
%y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
|
||||
%z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
|
||||
%w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
|
||||
store i32 0, ptr addrspace(5) %tmp
|
||||
store i32 0, ptr addrspace(5) %y
|
||||
store i32 0, ptr addrspace(5) %z
|
||||
store i32 0, ptr addrspace(5) %w
|
||||
%tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
|
||||
%tmp4 = load i32, ptr addrspace(5) %tmp1
|
||||
store i32 %tmp4, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; OPT-LABEL: @vector_read_bitcast_gep(
|
||||
; OPT: %0 = extractelement <4 x i32> <i32 1065353216, i32 1, i32 2, i32 3>, i32 %index
|
||||
; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
|
||||
define amdgpu_kernel void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) {
|
||||
; OPT: store i32 %0, ptr addrspace(1) %out, align 4
|
||||
define amdgpu_kernel void @vector_read_bitcast_gep(ptr addrspace(1) %out, i32 %index) {
|
||||
entry:
|
||||
%tmp = alloca [4 x i32], addrspace(5)
|
||||
%x = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
|
||||
%y = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
|
||||
%z = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
|
||||
%w = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
|
||||
%bc = bitcast i32 addrspace(5)* %x to float addrspace(5)*
|
||||
store float 1.0, float addrspace(5)* %bc
|
||||
store i32 1, i32 addrspace(5)* %y
|
||||
store i32 2, i32 addrspace(5)* %z
|
||||
store i32 3, i32 addrspace(5)* %w
|
||||
%tmp1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i32, i32 addrspace(5)* %tmp1
|
||||
store i32 %tmp2, i32 addrspace(1)* %out
|
||||
%y = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
|
||||
%z = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
|
||||
%w = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
|
||||
store float 1.0, ptr addrspace(5) %tmp
|
||||
store i32 1, ptr addrspace(5) %y
|
||||
store i32 2, ptr addrspace(5) %z
|
||||
store i32 3, ptr addrspace(5) %w
|
||||
%tmp1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i32, ptr addrspace(5) %tmp1
|
||||
store i32 %tmp2, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; OPT-LABEL: @vector_read_bitcast_alloca(
|
||||
; OPT: %0 = extractelement <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, i32 %index
|
||||
; OPT: store float %0, float addrspace(1)* %out, align 4
|
||||
define amdgpu_kernel void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) {
|
||||
; OPT: store float %0, ptr addrspace(1) %out, align 4
|
||||
define amdgpu_kernel void @vector_read_bitcast_alloca(ptr addrspace(1) %out, i32 %index) {
|
||||
entry:
|
||||
%tmp = alloca [4 x i32], addrspace(5)
|
||||
%tmp.bc = bitcast [4 x i32] addrspace(5)* %tmp to [4 x float] addrspace(5)*
|
||||
%x = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 0
|
||||
%y = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 1
|
||||
%z = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 2
|
||||
%w = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 3
|
||||
store float 0.0, float addrspace(5)* %x
|
||||
store float 1.0, float addrspace(5)* %y
|
||||
store float 2.0, float addrspace(5)* %z
|
||||
store float 4.0, float addrspace(5)* %w
|
||||
%tmp1 = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 %index
|
||||
%tmp2 = load float, float addrspace(5)* %tmp1
|
||||
store float %tmp2, float addrspace(1)* %out
|
||||
%y = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 1
|
||||
%z = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 2
|
||||
%w = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 3
|
||||
store float 0.0, ptr addrspace(5) %tmp
|
||||
store float 1.0, ptr addrspace(5) %y
|
||||
store float 2.0, ptr addrspace(5) %z
|
||||
store float 4.0, ptr addrspace(5) %w
|
||||
%tmp1 = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 %index
|
||||
%tmp2 = load float, ptr addrspace(5) %tmp1
|
||||
store float %tmp2, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -138,20 +129,19 @@ entry:
|
||||
|
||||
; OPT-LABEL: @vector_read_with_local_arg(
|
||||
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
|
||||
; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
|
||||
define amdgpu_kernel void @vector_read_with_local_arg(i32 addrspace(3)* %stopper, i32 addrspace(1)* %out, i32 %index) {
|
||||
; OPT: store i32 %0, ptr addrspace(1) %out, align 4
|
||||
define amdgpu_kernel void @vector_read_with_local_arg(ptr addrspace(3) %stopper, ptr addrspace(1) %out, i32 %index) {
|
||||
entry:
|
||||
%tmp = alloca [4 x i32], addrspace(5)
|
||||
%x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
|
||||
%y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
|
||||
%z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
|
||||
%w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
|
||||
store i32 0, i32 addrspace(5)* %x
|
||||
store i32 1, i32 addrspace(5)* %y
|
||||
store i32 2, i32 addrspace(5)* %z
|
||||
store i32 3, i32 addrspace(5)* %w
|
||||
%tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i32, i32 addrspace(5)* %tmp1
|
||||
store i32 %tmp2, i32 addrspace(1)* %out
|
||||
%y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
|
||||
%z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
|
||||
%w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
|
||||
store i32 0, ptr addrspace(5) %tmp
|
||||
store i32 1, ptr addrspace(5) %y
|
||||
store i32 2, ptr addrspace(5) %z
|
||||
store i32 3, ptr addrspace(5) %w
|
||||
%tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
|
||||
%tmp2 = load i32, ptr addrspace(5) %tmp1
|
||||
store i32 %tmp2, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user