
; Commit note: Previously the value created to represent the uninitialized
; memory of the alloca was undef; freeze poison is now used instead. This
; enables some optimization improvements (which need defeating in the limit
; tests), but also causes a few regressions. It also seems to leave behind
; dead code in some cases.
; (Source listing metadata: 415 lines, 24 KiB, LLVM)
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=sroa,amdgpu-promote-alloca < %s | FileCheck %s

; Make sure that array allocas loaded and stored as multi-element aggregates are handled correctly.
; Strictly, the promote-alloca pass shouldn't have to deal with this case as it is non-canonical, but
; the pass should handle it gracefully if it is encountered.
; The checks look for lines that previously caused issues in PromoteAlloca (non-canonical). Opt
; should now leave these unchanged

; Aggregate types used by the tests in this file.
%Block = type { [1 x float], i32 }
%gl_PerVertex = type { <4 x float>, float, [1 x float], [1 x float] }
%struct = type { i32, i32 }

; External addrspace(1) globals: @block is read by @promote_1d_aggr; @pv is the
; store destination used by most of the float tests.
@block = external addrspace(1) global %Block
@pv = external addrspace(1) global %gl_PerVertex

; promote_1d_aggr: a one-element float array is written with a single
; whole-aggregate store and then read back through a GEP whose index is loaded
; at run time. The expected output keeps the [1 x float] alloca, no longer
; contains the i32 temporary %i, and uses undef in place of the load from the
; never-written <4 x float> alloca.
define amdgpu_vs void @promote_1d_aggr() #0 {
; CHECK-LABEL: @promote_1d_aggr(
; CHECK-NEXT: [[F1:%.*]] = alloca [1 x float], align 4, addrspace(5)
; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK:%.*]], ptr addrspace(1) @block, i32 0, i32 1
; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4
; CHECK-NEXT: [[FOO3:%.*]] = load [1 x float], ptr addrspace(1) @block, align 4
; CHECK-NEXT: [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [1 x float] [[FOO3]], 0
; CHECK-NEXT: [[FOO3_FCA_0_GEP:%.*]] = getelementptr inbounds [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 0
; CHECK-NEXT: store float [[FOO3_FCA_0_EXTRACT]], ptr addrspace(5) [[FOO3_FCA_0_GEP]], align 4
; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO1]]
; CHECK-NEXT: [[FOO6:%.*]] = load float, ptr addrspace(5) [[FOO5]], align 4
; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> undef, float [[FOO6]], i32 0
; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[FOO6]], i32 1
; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[FOO6]], i32 2
; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[FOO6]], i32 3
; CHECK-NEXT: store <4 x float> [[FOO12]], ptr addrspace(1) @pv, align 16
; CHECK-NEXT: ret void
;
  %i = alloca i32, addrspace(5)
  %f1 = alloca [1 x float], addrspace(5)
  ; Read the run-time index (field 1 of @block) and park it in %i.
  %foo = getelementptr %Block, ptr addrspace(1) @block, i32 0, i32 1
  %foo1 = load i32, ptr addrspace(1) %foo
  store i32 %foo1, ptr addrspace(5) %i
  ; Whole-aggregate load/store of the array: the non-canonical form under test.
  %foo3 = load [1 x float], ptr addrspace(1) @block
  store [1 x float] %foo3, ptr addrspace(5) %f1
  ; Read one element back using the dynamic index.
  %foo4 = load i32, ptr addrspace(5) %i
  %foo5 = getelementptr [1 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
  %foo6 = load float, ptr addrspace(5) %foo5
  ; %foo7 is never stored to, so %foo8 reads uninitialized memory.
  %foo7 = alloca <4 x float>, addrspace(5)
  %foo8 = load <4 x float>, ptr addrspace(5) %foo7
  ; Write the loaded element into all four lanes and publish the vector.
  %foo9 = insertelement <4 x float> %foo8, float %foo6, i32 0
  %foo10 = insertelement <4 x float> %foo9, float %foo6, i32 1
  %foo11 = insertelement <4 x float> %foo10, float %foo6, i32 2
  %foo12 = insertelement <4 x float> %foo11, float %foo6, i32 3
  store <4 x float> %foo12, ptr addrspace(1) @pv
  ret void
}

; Buffer type and global read and written by @promote_store_aggr.
%Block2 = type { i32, [2 x float] }
@block2 = external addrspace(1) global %Block2

; promote_store_aggr: the [2 x float] array is filled one element at a time
; and then read back with a single whole-aggregate load. The expected output
; contains no allocas; the aggregate is rebuilt with insertvalue starting from
; poison.
define amdgpu_vs void @promote_store_aggr() #0 {
; CHECK-LABEL: @promote_store_aggr(
; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) @block2, align 4
; CHECK-NEXT: [[FOO3:%.*]] = sitofp i32 [[FOO1]] to float
; CHECK-NEXT: [[FOO6_FCA_0_INSERT:%.*]] = insertvalue [2 x float] poison, float [[FOO3]], 0
; CHECK-NEXT: [[FOO6_FCA_1_INSERT:%.*]] = insertvalue [2 x float] [[FOO6_FCA_0_INSERT]], float 2.000000e+00, 1
; CHECK-NEXT: [[FOO7:%.*]] = getelementptr [[BLOCK2:%.*]], ptr addrspace(1) @block2, i32 0, i32 1
; CHECK-NEXT: store [2 x float] [[FOO6_FCA_1_INSERT]], ptr addrspace(1) [[FOO7]], align 4
; CHECK-NEXT: store <4 x float> splat (float 1.000000e+00), ptr addrspace(1) @pv, align 16
; CHECK-NEXT: ret void
;
  %i = alloca i32, addrspace(5)
  %f1 = alloca [2 x float], addrspace(5)
  ; Round-trip the i32 at the start of @block2 through %i, then convert it.
  %foo1 = load i32, ptr addrspace(1) @block2
  store i32 %foo1, ptr addrspace(5) %i
  %foo2 = load i32, ptr addrspace(5) %i
  %foo3 = sitofp i32 %foo2 to float
  ; Element-wise stores: f1[0] = converted value, f1[1] = 2.0.
  store float %foo3, ptr addrspace(5) %f1
  %foo5 = getelementptr [2 x float], ptr addrspace(5) %f1, i32 0, i32 1
  store float 2.000000e+00, ptr addrspace(5) %foo5
  ; Whole-aggregate load of the array: the non-canonical form under test.
  %foo6 = load [2 x float], ptr addrspace(5) %f1
  %foo7 = getelementptr %Block2, ptr addrspace(1) @block2, i32 0, i32 1
  store [2 x float] %foo6, ptr addrspace(1) %foo7
  store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, ptr addrspace(1) @pv
  ret void
}

; Input buffer read by @promote_load_from_store_aggr and by the memcpy tests.
%Block3 = type { [2 x float], i32 }
@block3 = external addrspace(1) global %Block3

; promote_load_from_store_aggr: same shape as @promote_1d_aggr but with a
; two-element array. The expected output replaces the array alloca with a
; <2 x float> value seeded from `freeze poison`; the aggregate store becomes
; extractvalue + insertelement pairs and the dynamically indexed load becomes
; an extractelement.
define amdgpu_vs void @promote_load_from_store_aggr() #0 {
; CHECK-LABEL: @promote_load_from_store_aggr(
; CHECK-NEXT: [[F1:%.*]] = freeze <2 x float> poison
; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 1
; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4
; CHECK-NEXT: [[FOO3:%.*]] = load [2 x float], ptr addrspace(1) @block3, align 4
; CHECK-NEXT: [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> [[F1]], float [[FOO3_FCA_0_EXTRACT]], i32 0
; CHECK-NEXT: [[FOO3_FCA_1_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3_FCA_1_EXTRACT]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 [[FOO1]]
; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP3]], i32 1
; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP3]], i32 2
; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP3]], i32 3
; CHECK-NEXT: store <4 x float> [[FOO12]], ptr addrspace(1) @pv, align 16
; CHECK-NEXT: ret void
;
  %i = alloca i32, addrspace(5)
  %f1 = alloca [2 x float], addrspace(5)
  ; Read the run-time index (field 1 of @block3) and park it in %i.
  %foo = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 1
  %foo1 = load i32, ptr addrspace(1) %foo
  store i32 %foo1, ptr addrspace(5) %i
  ; Whole-aggregate load/store of the array: the non-canonical form under test.
  %foo3 = load [2 x float], ptr addrspace(1) @block3
  store [2 x float] %foo3, ptr addrspace(5) %f1
  ; Read one element back using the dynamic index.
  %foo4 = load i32, ptr addrspace(5) %i
  %foo5 = getelementptr [2 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
  %foo6 = load float, ptr addrspace(5) %foo5
  ; %foo7 is never stored to, so %foo8 reads uninitialized memory.
  %foo7 = alloca <4 x float>, addrspace(5)
  %foo8 = load <4 x float>, ptr addrspace(5) %foo7
  ; Write the loaded element into all four lanes and publish the vector.
  %foo9 = insertelement <4 x float> %foo8, float %foo6, i32 0
  %foo10 = insertelement <4 x float> %foo9, float %foo6, i32 1
  %foo11 = insertelement <4 x float> %foo10, float %foo6, i32 2
  %foo12 = insertelement <4 x float> %foo11, float %foo6, i32 3
  store <4 x float> %foo12, ptr addrspace(1) @pv
  ret void
}

; Integer buffer and output globals used by @promote_load_from_store_aggr_varoff.
%Block4 = type { [2 x i32], i32 }
@block4 = external addrspace(1) global %Block4
%gl_PV = type { <4 x i32>, i32, [1 x i32], [1 x i32] }
@pv1 = external addrspace(1) global %gl_PV

; This should not crash on an aliased variable offset that can be
; optimized out (variable %aliasTofoo3 in the test).
; The value stored at byte offset 8 of %f1 is immediately reloaded and then
; used as the dynamic index into the same array. The expected output replaces
; the array with a <3 x i32> value seeded from `freeze poison`, inserts the
; loaded value at lane 2 and extracts using that same value as the lane index.
define amdgpu_vs void @promote_load_from_store_aggr_varoff(<4 x i32> %input) {
; CHECK-LABEL: @promote_load_from_store_aggr_varoff(
; CHECK-NEXT: [[F1:%.*]] = freeze <3 x i32> poison
; CHECK-NEXT: [[FOO3_UNPACK2:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @block4, i64 8), align 4
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x i32> [[F1]], i32 [[FOO3_UNPACK2]], i32 2
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i32> [[TMP1]], i32 [[FOO3_UNPACK2]]
; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x i32> [[INPUT:%.*]], i32 [[TMP2]], i64 3
; CHECK-NEXT: store <4 x i32> [[FOO12]], ptr addrspace(1) @pv1, align 16
; CHECK-NEXT: ret void
;
  %f1 = alloca [3 x i32], align 4, addrspace(5)
  ; Byte-offset GEP: 8 bytes into %f1.
  %G1 = getelementptr inbounds i8, ptr addrspace(5) %f1, i32 8
  %foo3.unpack2 = load i32, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @block4, i64 8), align 4
  store i32 %foo3.unpack2, ptr addrspace(5) %G1, align 4
  ; Reload of the value just stored; it aliases %foo3.unpack2.
  %aliasTofoo3 = load i32, ptr addrspace(5) %G1, align 4
  ; Use the reloaded value as a dynamic index into the same array.
  %foo5 = getelementptr [3 x i32], ptr addrspace(5) %f1, i32 0, i32 %aliasTofoo3
  %foo6 = load i32, ptr addrspace(5) %foo5, align 4
  %foo12 = insertelement <4 x i32> %input, i32 %foo6, i64 3
  store <4 x i32> %foo12, ptr addrspace(1) @pv1, align 16
  ret void
}

; promote_memmove_aggr: a zero-initialized [5 x float] array gets two element
; stores and is then shifted down by one element with an overlapping memmove.
; The expected output models the array as a <5 x float> value and the memmove
; as a shufflevector; the final store writes the constant 1.0, while the
; insertelement/shufflevector chain is still listed before it.
define amdgpu_vs void @promote_memmove_aggr() #0 {
; CHECK-LABEL: @promote_memmove_aggr(
; CHECK-NEXT: [[F1:%.*]] = freeze <5 x float> poison
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <5 x float> [[F1]], float 0.000000e+00, i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 0.000000e+00, i32 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <5 x float> [[TMP2]], float 0.000000e+00, i32 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 0.000000e+00, i32 3
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <5 x float> [[TMP4]], float 0.000000e+00, i32 4
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <5 x float> [[TMP5]], float 1.000000e+00, i32 1
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <5 x float> [[TMP6]], float 2.000000e+00, i32 3
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <5 x float> [[TMP7]], <5 x float> poison, <5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 4>
; CHECK-NEXT: store float 1.000000e+00, ptr addrspace(1) @pv, align 4
; CHECK-NEXT: ret void
;
  %f1 = alloca [5 x float], addrspace(5)
  ; Whole-aggregate zero store followed by f1[1] = 1.0 and f1[3] = 2.0.
  store [5 x float] zeroinitializer, ptr addrspace(5) %f1
  %foo1 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 1
  store float 1.0, ptr addrspace(5) %foo1
  %foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3
  store float 2.0, ptr addrspace(5) %foo2
  ; Move 16 bytes (four floats) from &f1[1] to &f1[0]; the ranges overlap.
  call void @llvm.memmove.p5.p5.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo1, i32 16, i1 false)
  ; f1[0] now holds what was f1[1].
  %foo3 = load float, ptr addrspace(5) %f1
  store float %foo3, ptr addrspace(1) @pv
  ret void
}

; promote_memcpy_aggr: a zero-initialized [5 x float] array receives one
; constant-index store and one dynamically indexed store, then its last two
; elements are copied to the front with memcpy. The expected output models the
; array as a <5 x float> value and the memcpy as a shufflevector.
define amdgpu_vs void @promote_memcpy_aggr() #0 {
; CHECK-LABEL: @promote_memcpy_aggr(
; CHECK-NEXT: [[F1:%.*]] = freeze <5 x float> poison
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <5 x float> [[F1]], float 0.000000e+00, i32 0
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <5 x float> [[TMP7]], float 0.000000e+00, i32 1
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <5 x float> [[TMP8]], float 0.000000e+00, i32 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x float> [[TMP9]], float 0.000000e+00, i32 3
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <5 x float> [[TMP4]], float 0.000000e+00, i32 4
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <5 x float> [[TMP5]], float 2.000000e+00, i32 3
; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <5 x float> [[TMP6]], float 3.000000e+00, i32 [[FOO4]]
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0
; CHECK-NEXT: store float [[TMP3]], ptr addrspace(1) @pv, align 4
; CHECK-NEXT: ret void
;
  %f1 = alloca [5 x float], addrspace(5)
  store [5 x float] zeroinitializer, ptr addrspace(5) %f1

  ; Constant-index store: f1[3] = 2.0.
  %foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3
  store float 2.0, ptr addrspace(5) %foo2

  ; Dynamically indexed store: f1[idx] = 3.0, with idx loaded as an i32 from
  ; the start of @block3.
  %foo3 = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 0
  %foo4 = load i32, ptr addrspace(1) %foo3
  %foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
  store float 3.0, ptr addrspace(5) %foo5

  ; Copy 8 bytes (two floats) from &f1[3] to &f1[0], then read f1[0].
  call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo2, i32 8, i1 false)
  %foo6 = load float, ptr addrspace(5) %f1
  store float %foo6, ptr addrspace(1) @pv
  ret void
}

; promote_memcpy_identity_aggr: the memcpy source and destination are the same
; pointer (%f1) and cover the whole 20-byte array. The expected output is only
; the store of 0.0 (the value of f1[0]) followed by the return.
define amdgpu_vs void @promote_memcpy_identity_aggr() #0 {
; CHECK-LABEL: @promote_memcpy_identity_aggr(
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(1) @pv, align 4
; CHECK-NEXT: ret void
;
  %f1 = alloca [5 x float], addrspace(5)
  ; Whole-aggregate zero store followed by f1[1] = 1.0 and f1[3] = 2.0.
  store [5 x float] zeroinitializer, ptr addrspace(5) %f1
  %foo1 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 1
  store float 1.0, ptr addrspace(5) %foo1
  %foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3
  store float 2.0, ptr addrspace(5) %foo2
  ; Self-copy: destination and source are both %f1.
  call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %f1, i32 20, i1 false)
  %foo3 = load float, ptr addrspace(5) %f1
  store float %foo3, ptr addrspace(1) @pv
  ret void
}

; TODO: promote alloca even if there is a memcpy between different allocas.
; Two zero-initialized arrays with a memcpy from %f1 into %f2. The expected
; output keeps both allocas and the memcpy call; only the whole-aggregate zero
; stores are split into per-element stores.
define amdgpu_vs void @promote_memcpy_two_aggrs() #0 {
; CHECK-LABEL: @promote_memcpy_two_aggrs(
; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
; CHECK-NEXT: [[F2:%.*]] = alloca [5 x float], align 4, addrspace(5)
; CHECK-NEXT: [[DOTFCA_0_GEP1:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 0
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP1]], align 4
; CHECK-NEXT: [[DOTFCA_1_GEP2:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP2]], align 4
; CHECK-NEXT: [[DOTFCA_2_GEP3:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 2
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP3]], align 4
; CHECK-NEXT: [[DOTFCA_3_GEP4:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP4]], align 4
; CHECK-NEXT: [[DOTFCA_4_GEP5:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 4
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP5]], align 4
; CHECK-NEXT: [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 0
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP]], align 4
; CHECK-NEXT: [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 1
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP]], align 4
; CHECK-NEXT: [[DOTFCA_2_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 2
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP]], align 4
; CHECK-NEXT: [[DOTFCA_3_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 3
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP]], align 4
; CHECK-NEXT: [[DOTFCA_4_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 4
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP]], align 4
; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
; CHECK-NEXT: store float 3.000000e+00, ptr addrspace(5) [[FOO5]], align 4
; CHECK-NEXT: call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 [[F2]], ptr addrspace(5) align 4 [[F1]], i32 8, i1 false)
; CHECK-NEXT: [[FOO6:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 [[FOO4]]
; CHECK-NEXT: [[FOO7:%.*]] = load float, ptr addrspace(5) [[FOO6]], align 4
; CHECK-NEXT: store float [[FOO7]], ptr addrspace(1) @pv, align 4
; CHECK-NEXT: ret void
;
  %f1 = alloca [5 x float], addrspace(5)
  %f2 = alloca [5 x float], addrspace(5)

  store [5 x float] zeroinitializer, ptr addrspace(5) %f1
  store [5 x float] zeroinitializer, ptr addrspace(5) %f2

  ; Dynamically indexed store into %f1, with idx loaded as an i32 from the
  ; start of @block3.
  %foo3 = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 0
  %foo4 = load i32, ptr addrspace(1) %foo3
  %foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
  store float 3.0, ptr addrspace(5) %foo5

  ; Copy the first 8 bytes of %f1 into %f2: a memcpy between distinct allocas.
  call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 %f2, ptr addrspace(5) align 4 %f1, i32 8, i1 false)

  ; Read %f2 back at the same dynamic index.
  %foo6 = getelementptr [5 x float], ptr addrspace(5) %f2, i32 0, i32 %foo4
  %foo7 = load float, ptr addrspace(5) %foo6
  store float %foo7, ptr addrspace(1) @pv
  ret void
}

; TODO: promote alloca even if there is a memcpy between the alloca and another memory space.
; The memcpy here copies from the addrspace(5) alloca into the addrspace(1)
; global @pv. The expected output keeps the alloca and the memcpy call. The
; %src argument is not referenced in the body.
define amdgpu_vs void @promote_memcpy_p1p5_aggr(ptr addrspace(1) inreg %src) #0 {
; CHECK-LABEL: @promote_memcpy_p1p5_aggr(
; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
; CHECK-NEXT: [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 0
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP]], align 4
; CHECK-NEXT: [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP]], align 4
; CHECK-NEXT: [[DOTFCA_2_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 2
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP]], align 4
; CHECK-NEXT: [[DOTFCA_3_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP]], align 4
; CHECK-NEXT: [[DOTFCA_4_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 4
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP]], align 4
; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
; CHECK-NEXT: store float 3.000000e+00, ptr addrspace(5) [[FOO5]], align 4
; CHECK-NEXT: call void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) align 4 @pv, ptr addrspace(5) align 4 [[F1]], i32 8, i1 false)
; CHECK-NEXT: ret void
;
  %f1 = alloca [5 x float], addrspace(5)
  store [5 x float] zeroinitializer, ptr addrspace(5) %f1

  ; Dynamically indexed store into %f1, with idx loaded as an i32 from the
  ; start of @block3.
  %foo3 = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 0
  %foo4 = load i32, ptr addrspace(1) %foo3
  %foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
  store float 3.0, ptr addrspace(5) %foo5

  ; Copy the first 8 bytes of the alloca out to the global @pv.
  call void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) align 4 @pv, ptr addrspace(5) align 4 %f1, i32 8, i1 false)
  ret void
}

; promote_memcpy_inline_aggr: same pattern as @promote_memcpy_aggr but using
; the llvm.memcpy.inline intrinsic and without the constant-index store to
; f1[3]. The expected output models the array as a <5 x float> value and the
; copy as a shufflevector.
define amdgpu_vs void @promote_memcpy_inline_aggr() #0 {
; CHECK-LABEL: @promote_memcpy_inline_aggr(
; CHECK-NEXT: [[F1:%.*]] = freeze <5 x float> poison
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <5 x float> [[F1]], float 0.000000e+00, i32 0
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <5 x float> [[TMP6]], float 0.000000e+00, i32 1
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <5 x float> [[TMP7]], float 0.000000e+00, i32 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x float> [[TMP8]], float 0.000000e+00, i32 3
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <5 x float> [[TMP4]], float 0.000000e+00, i32 4
; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <5 x float> [[TMP5]], float 3.000000e+00, i32 [[FOO4]]
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0
; CHECK-NEXT: store float [[TMP3]], ptr addrspace(1) @pv, align 4
; CHECK-NEXT: ret void
;
  %f1 = alloca [5 x float], addrspace(5)
  store [5 x float] zeroinitializer, ptr addrspace(5) %f1

  ; %foo2 = &f1[3] is the memcpy source; idx is loaded as an i32 from the start
  ; of @block3 and used for the dynamically indexed store f1[idx] = 3.0.
  %foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3
  %foo3 = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 0
  %foo4 = load i32, ptr addrspace(1) %foo3
  %foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
  store float 3.0, ptr addrspace(5) %foo5

  ; Copy 8 bytes (two floats) from &f1[3] to &f1[0], then read f1[0].
  call void @llvm.memcpy.inline.p5.p5.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo2, i32 8, i1 false)
  %foo6 = load float, ptr addrspace(5) %f1
  store float %foo6, ptr addrspace(1) @pv
  ret void
}

; Memory-transfer intrinsics called by the memcpy/memmove tests in this file.
declare void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
declare void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
declare void @llvm.memcpy.inline.p5.p5.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
declare void @llvm.memmove.p5.p5.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)

; Input and output globals used by @promote_double_aggr.
@tmp_g = external addrspace(1) global { [4 x double], <2 x double>, <3 x double>, <4 x double> }
@frag_color = external addrspace(1) global <4 x float>

; promote_double_aggr: a [2 x double] aggregate is built with insertvalue,
; stored to the alloca in one piece, and then read and updated one element at
; a time. The expected output contains no alloca; the element reads become
; extractvalue instructions on the stored aggregate.
define amdgpu_ps void @promote_double_aggr() #0 {
; CHECK-LABEL: @promote_double_aggr(
; CHECK-NEXT: [[FOO:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 0
; CHECK-NEXT: [[FOO1:%.*]] = load double, ptr addrspace(1) [[FOO]], align 8
; CHECK-NEXT: [[FOO2:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 1
; CHECK-NEXT: [[FOO3:%.*]] = load double, ptr addrspace(1) [[FOO2]], align 8
; CHECK-NEXT: [[FOO4:%.*]] = insertvalue [2 x double] poison, double [[FOO1]], 0
; CHECK-NEXT: [[FOO5:%.*]] = insertvalue [2 x double] [[FOO4]], double [[FOO3]], 1
; CHECK-NEXT: [[FOO5_FCA_0_EXTRACT:%.*]] = extractvalue [2 x double] [[FOO5]], 0
; CHECK-NEXT: [[FOO5_FCA_1_EXTRACT:%.*]] = extractvalue [2 x double] [[FOO5]], 1
; CHECK-NEXT: [[FOO10:%.*]] = fadd double [[FOO5_FCA_1_EXTRACT]], [[FOO5_FCA_1_EXTRACT]]
; CHECK-NEXT: [[FOO16:%.*]] = fadd double [[FOO10]], [[FOO5_FCA_1_EXTRACT]]
; CHECK-NEXT: [[FOO17:%.*]] = fptrunc double [[FOO16]] to float
; CHECK-NEXT: [[FOO18:%.*]] = insertelement <4 x float> poison, float [[FOO17]], i32 0
; CHECK-NEXT: [[FOO19:%.*]] = insertelement <4 x float> [[FOO18]], float [[FOO17]], i32 1
; CHECK-NEXT: [[FOO20:%.*]] = insertelement <4 x float> [[FOO19]], float [[FOO17]], i32 2
; CHECK-NEXT: [[FOO21:%.*]] = insertelement <4 x float> [[FOO20]], float [[FOO17]], i32 3
; CHECK-NEXT: store <4 x float> [[FOO21]], ptr addrspace(1) @frag_color, align 16
; CHECK-NEXT: ret void
;
  %s = alloca [2 x double], addrspace(5)
  ; Load elements 0 and 1 of the [4 x double] field of @tmp_g.
  %foo = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 0
  %foo1 = load double, ptr addrspace(1) %foo
  %foo2 = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 1
  %foo3 = load double, ptr addrspace(1) %foo2
  ; Build the aggregate and store it in one piece: the form under test.
  %foo4 = insertvalue [2 x double] poison, double %foo1, 0
  %foo5 = insertvalue [2 x double] %foo4, double %foo3, 1
  store [2 x double] %foo5, ptr addrspace(5) %s
  ; s[0] = s[1] + s[1]
  %foo6 = getelementptr [2 x double], ptr addrspace(5) %s, i32 0, i32 1
  %foo7 = load double, ptr addrspace(5) %foo6
  %foo8 = getelementptr [2 x double], ptr addrspace(5) %s, i32 0, i32 1
  %foo9 = load double, ptr addrspace(5) %foo8
  %foo10 = fadd double %foo7, %foo9
  store double %foo10, ptr addrspace(5) %s
  ; result = s[0] + s[1], truncated to float
  %foo13 = load double, ptr addrspace(5) %s
  %foo14 = getelementptr [2 x double], ptr addrspace(5) %s, i32 0, i32 1
  %foo15 = load double, ptr addrspace(5) %foo14
  %foo16 = fadd double %foo13, %foo15
  %foo17 = fptrunc double %foo16 to float
  ; Write the result into all four lanes and publish the vector.
  %foo18 = insertelement <4 x float> poison, float %foo17, i32 0
  %foo19 = insertelement <4 x float> %foo18, float %foo17, i32 1
  %foo20 = insertelement <4 x float> %foo19, float %foo17, i32 2
  %foo21 = insertelement <4 x float> %foo20, float %foo17, i32 3
  store <4 x float> %foo21, ptr addrspace(1) @frag_color
  ret void
}

; Don't crash on a type that isn't a valid vector element.
; The alloca's element type is the struct %struct and the alloca has no users;
; the expected output is just the entry label and the return.
define amdgpu_kernel void @alloca_struct() #0 {
; CHECK-LABEL: @alloca_struct(
; CHECK-NEXT: entry:
; CHECK-NEXT: ret void
;
entry:
  %alloca = alloca [2 x %struct], align 4, addrspace(5)
  ret void
}