llvm-project/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
Matt Arsenault c5fe075eaf
AMDGPU: Use freeze poison instead of undef in alloca promotion (#131285)
Previously the value created to represent the uninitialized memory
of the alloca was undef. Use freeze poison instead. Enables some
optimization improvements (which need defeating in the limit tests),
but also a few regressions. Seems to leave behind dead code in some
cases too.
2025-03-18 17:27:02 +07:00

415 lines
24 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=sroa,amdgpu-promote-alloca < %s | FileCheck %s
; Make sure that array allocas loaded and stored as multi-element aggregates are handled correctly.
; Strictly speaking, the promote-alloca pass shouldn't have to deal with this case as it is
; non-canonical, but the pass should handle it gracefully if it occurs.
; The checks look for lines that previously caused issues in PromoteAlloca (non-canonical). Opt
; should now leave these unchanged.
; Aggregate types and external addrspace(1) globals referenced by the tests in this file.
; %Block is a one-element float array followed by an i32.
%Block = type { [1 x float], i32 }
%gl_PerVertex = type { <4 x float>, float, [1 x float], [1 x float] }
; %struct is the array element type of the alloca in @alloca_struct.
%struct = type { i32, i32 }
@block = external addrspace(1) global %Block
@pv = external addrspace(1) global %gl_PerVertex
; Stores a whole [1 x float] aggregate into the addrspace(5) alloca %f1 and then
; reads one float back through a GEP whose index was loaded from @block.
; The expected output still contains the alloca for %f1 (the aggregate store is
; only split into a per-element store), while the %i alloca no longer appears.
define amdgpu_vs void @promote_1d_aggr() #0 {
; CHECK-LABEL: @promote_1d_aggr(
; CHECK-NEXT: [[F1:%.*]] = alloca [1 x float], align 4, addrspace(5)
; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK:%.*]], ptr addrspace(1) @block, i32 0, i32 1
; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4
; CHECK-NEXT: [[FOO3:%.*]] = load [1 x float], ptr addrspace(1) @block, align 4
; CHECK-NEXT: [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [1 x float] [[FOO3]], 0
; CHECK-NEXT: [[FOO3_FCA_0_GEP:%.*]] = getelementptr inbounds [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 0
; CHECK-NEXT: store float [[FOO3_FCA_0_EXTRACT]], ptr addrspace(5) [[FOO3_FCA_0_GEP]], align 4
; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO1]]
; CHECK-NEXT: [[FOO6:%.*]] = load float, ptr addrspace(5) [[FOO5]], align 4
; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> undef, float [[FOO6]], i32 0
; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[FOO6]], i32 1
; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[FOO6]], i32 2
; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[FOO6]], i32 3
; CHECK-NEXT: store <4 x float> [[FOO12]], ptr addrspace(1) @pv, align 16
; CHECK-NEXT: ret void
;
%i = alloca i32, addrspace(5)
%f1 = alloca [1 x float], addrspace(5)
%foo = getelementptr %Block, ptr addrspace(1) @block, i32 0, i32 1
%foo1 = load i32, ptr addrspace(1) %foo
store i32 %foo1, ptr addrspace(5) %i
; Whole-aggregate load/store: this is the non-canonical access under test.
%foo3 = load [1 x float], ptr addrspace(1) @block
store [1 x float] %foo3, ptr addrspace(5) %f1
%foo4 = load i32, ptr addrspace(5) %i
%foo5 = getelementptr [1 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
%foo6 = load float, ptr addrspace(5) %foo5
; %foo7 is never written before it is loaded, so the expected output uses undef
; as the base vector of the insertelement chain.
%foo7 = alloca <4 x float>, addrspace(5)
%foo8 = load <4 x float>, ptr addrspace(5) %foo7
%foo9 = insertelement <4 x float> %foo8, float %foo6, i32 0
%foo10 = insertelement <4 x float> %foo9, float %foo6, i32 1
%foo11 = insertelement <4 x float> %foo10, float %foo6, i32 2
%foo12 = insertelement <4 x float> %foo11, float %foo6, i32 3
store <4 x float> %foo12, ptr addrspace(1) @pv
ret void
}
; %Block2 is an i32 followed by a two-element float array; @block2 is both read
; and written by @promote_store_aggr.
%Block2 = type { i32, [2 x float] }
@block2 = external addrspace(1) global %Block2
; Fills the [2 x float] alloca %f1 with two scalar float stores, then loads it
; back as one aggregate and stores that aggregate to field 1 of @block2.
; The expected output contains no alloca: the aggregate is rebuilt with
; insertvalue starting from poison.
define amdgpu_vs void @promote_store_aggr() #0 {
; CHECK-LABEL: @promote_store_aggr(
; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) @block2, align 4
; CHECK-NEXT: [[FOO3:%.*]] = sitofp i32 [[FOO1]] to float
; CHECK-NEXT: [[FOO6_FCA_0_INSERT:%.*]] = insertvalue [2 x float] poison, float [[FOO3]], 0
; CHECK-NEXT: [[FOO6_FCA_1_INSERT:%.*]] = insertvalue [2 x float] [[FOO6_FCA_0_INSERT]], float 2.000000e+00, 1
; CHECK-NEXT: [[FOO7:%.*]] = getelementptr [[BLOCK2:%.*]], ptr addrspace(1) @block2, i32 0, i32 1
; CHECK-NEXT: store [2 x float] [[FOO6_FCA_1_INSERT]], ptr addrspace(1) [[FOO7]], align 4
; CHECK-NEXT: store <4 x float> splat (float 1.000000e+00), ptr addrspace(1) @pv, align 16
; CHECK-NEXT: ret void
;
%i = alloca i32, addrspace(5)
%f1 = alloca [2 x float], addrspace(5)
%foo1 = load i32, ptr addrspace(1) @block2
store i32 %foo1, ptr addrspace(5) %i
%foo2 = load i32, ptr addrspace(5) %i
%foo3 = sitofp i32 %foo2 to float
; Element 0 is written through the base pointer of %f1, element 1 through %foo5.
store float %foo3, ptr addrspace(5) %f1
%foo5 = getelementptr [2 x float], ptr addrspace(5) %f1, i32 0, i32 1
store float 2.000000e+00, ptr addrspace(5) %foo5
; Whole-aggregate load of the alloca: the non-canonical access under test.
%foo6 = load [2 x float], ptr addrspace(5) %f1
%foo7 = getelementptr %Block2, ptr addrspace(1) @block2, i32 0, i32 1
store [2 x float] %foo6, ptr addrspace(1) %foo7
; All four lanes are 1.0; the expected output prints this constant as a splat.
store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, ptr addrspace(1) @pv
ret void
}
; %Block3 is a two-element float array followed by an i32; @block3 supplies the
; input data and dynamic indices for several tests below.
%Block3 = type { [2 x float], i32 }
@block3 = external addrspace(1) global %Block3
; Stores a whole [2 x float] aggregate into the alloca %f1, then loads one float
; from it at an index that is itself loaded from @block3.
; In the expected output %f1 becomes a <2 x float> value seeded with a frozen
; poison; the aggregate store turns into one insertelement per element and the
; indexed load into an extractelement with the dynamic index.
define amdgpu_vs void @promote_load_from_store_aggr() #0 {
; CHECK-LABEL: @promote_load_from_store_aggr(
; CHECK-NEXT: [[F1:%.*]] = freeze <2 x float> poison
; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 1
; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4
; CHECK-NEXT: [[FOO3:%.*]] = load [2 x float], ptr addrspace(1) @block3, align 4
; CHECK-NEXT: [[FOO3_FCA_0_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 0
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> [[F1]], float [[FOO3_FCA_0_EXTRACT]], i32 0
; CHECK-NEXT: [[FOO3_FCA_1_EXTRACT:%.*]] = extractvalue [2 x float] [[FOO3]], 1
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3_FCA_1_EXTRACT]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 [[FOO1]]
; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0
; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP3]], i32 1
; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP3]], i32 2
; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP3]], i32 3
; CHECK-NEXT: store <4 x float> [[FOO12]], ptr addrspace(1) @pv, align 16
; CHECK-NEXT: ret void
;
%i = alloca i32, addrspace(5)
%f1 = alloca [2 x float], addrspace(5)
%foo = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 1
%foo1 = load i32, ptr addrspace(1) %foo
store i32 %foo1, ptr addrspace(5) %i
; Whole-aggregate load/store: this is the non-canonical access under test.
%foo3 = load [2 x float], ptr addrspace(1) @block3
store [2 x float] %foo3, ptr addrspace(5) %f1
%foo4 = load i32, ptr addrspace(5) %i
%foo5 = getelementptr [2 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
%foo6 = load float, ptr addrspace(5) %foo5
; %foo7 is never written before it is loaded, so the expected output uses undef
; as the base vector of the insertelement chain.
%foo7 = alloca <4 x float>, addrspace(5)
%foo8 = load <4 x float>, ptr addrspace(5) %foo7
%foo9 = insertelement <4 x float> %foo8, float %foo6, i32 0
%foo10 = insertelement <4 x float> %foo9, float %foo6, i32 1
%foo11 = insertelement <4 x float> %foo10, float %foo6, i32 2
%foo12 = insertelement <4 x float> %foo11, float %foo6, i32 3
store <4 x float> %foo12, ptr addrspace(1) @pv
ret void
}
; Integer counterparts of the float types above; @block4 and @pv1 are the input
; and output globals of @promote_load_from_store_aggr_varoff.
%Block4 = type { [2 x i32], i32 }
@block4 = external addrspace(1) global %Block4
%gl_PV = type { <4 x i32>, i32, [1 x i32], [1 x i32] }
@pv1 = external addrspace(1) global %gl_PV
; This should not crash on an aliased variable offset that can be
; optimized out (variable %aliasTofoo3 in the test)
; %G1 addresses byte offset 8 of the [3 x i32] alloca, which the expected output
; treats as vector element 2. The value stored there is reloaded as
; %aliasTofoo3 and used as the dynamic index of the final load; in the expected
; output the reload is gone and the originally loaded value is used directly.
define amdgpu_vs void @promote_load_from_store_aggr_varoff(<4 x i32> %input) {
; CHECK-LABEL: @promote_load_from_store_aggr_varoff(
; CHECK-NEXT: [[F1:%.*]] = freeze <3 x i32> poison
; CHECK-NEXT: [[FOO3_UNPACK2:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @block4, i64 8), align 4
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x i32> [[F1]], i32 [[FOO3_UNPACK2]], i32 2
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x i32> [[TMP1]], i32 [[FOO3_UNPACK2]]
; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x i32> [[INPUT:%.*]], i32 [[TMP2]], i64 3
; CHECK-NEXT: store <4 x i32> [[FOO12]], ptr addrspace(1) @pv1, align 16
; CHECK-NEXT: ret void
;
%f1 = alloca [3 x i32], align 4, addrspace(5)
; Byte-based GEP into the alloca rather than an array-typed one.
%G1 = getelementptr inbounds i8, ptr addrspace(5) %f1, i32 8
%foo3.unpack2 = load i32, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @block4, i64 8), align 4
store i32 %foo3.unpack2, ptr addrspace(5) %G1, align 4
; Reload of the value just stored; it becomes the variable offset below.
%aliasTofoo3 = load i32, ptr addrspace(5) %G1, align 4
%foo5 = getelementptr [3 x i32], ptr addrspace(5) %f1, i32 0, i32 %aliasTofoo3
%foo6 = load i32, ptr addrspace(5) %foo5, align 4
%foo12 = insertelement <4 x i32> %input, i32 %foo6, i64 3
store <4 x i32> %foo12, ptr addrspace(1) @pv1, align 16
ret void
}
; Zero-initializes a [5 x float] alloca with one aggregate store, overwrites
; elements 1 and 3, then uses memmove to copy 16 bytes from element 1 to the
; start of the same alloca (the two ranges overlap) and reads element 0.
; In the expected output the memmove becomes a shufflevector with mask
; <1, 2, 3, 4, 4>, and the final store to @pv is the constant 1.0.
define amdgpu_vs void @promote_memmove_aggr() #0 {
; CHECK-LABEL: @promote_memmove_aggr(
; CHECK-NEXT: [[F1:%.*]] = freeze <5 x float> poison
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <5 x float> [[F1]], float 0.000000e+00, i32 0
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <5 x float> [[TMP1]], float 0.000000e+00, i32 1
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <5 x float> [[TMP2]], float 0.000000e+00, i32 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x float> [[TMP3]], float 0.000000e+00, i32 3
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <5 x float> [[TMP4]], float 0.000000e+00, i32 4
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <5 x float> [[TMP5]], float 1.000000e+00, i32 1
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <5 x float> [[TMP6]], float 2.000000e+00, i32 3
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <5 x float> [[TMP7]], <5 x float> poison, <5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 4>
; CHECK-NEXT: store float 1.000000e+00, ptr addrspace(1) @pv, align 4
; CHECK-NEXT: ret void
;
%f1 = alloca [5 x float], addrspace(5)
store [5 x float] zeroinitializer, ptr addrspace(5) %f1
%foo1 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 1
store float 1.0, ptr addrspace(5) %foo1
%foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3
store float 2.0, ptr addrspace(5) %foo2
; Destination is the alloca base, source is element 1; 16 bytes = 4 floats.
call void @llvm.memmove.p5.p5.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo1, i32 16, i1 false)
%foo3 = load float, ptr addrspace(5) %f1
store float %foo3, ptr addrspace(1) @pv
ret void
}
; Zero-initializes a [5 x float] alloca, writes 2.0 to element 3 and 3.0 to an
; element selected by a value loaded from @block3, then memcpys 8 bytes from
; element 3 to the start of the same alloca and reads element 0.
; In the expected output the memcpy becomes a shufflevector with mask
; <3, 4, 2, 3, 4> followed by an extractelement of lane 0.
define amdgpu_vs void @promote_memcpy_aggr() #0 {
; CHECK-LABEL: @promote_memcpy_aggr(
; CHECK-NEXT: [[F1:%.*]] = freeze <5 x float> poison
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <5 x float> [[F1]], float 0.000000e+00, i32 0
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <5 x float> [[TMP7]], float 0.000000e+00, i32 1
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <5 x float> [[TMP8]], float 0.000000e+00, i32 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x float> [[TMP9]], float 0.000000e+00, i32 3
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <5 x float> [[TMP4]], float 0.000000e+00, i32 4
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <5 x float> [[TMP5]], float 2.000000e+00, i32 3
; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <5 x float> [[TMP6]], float 3.000000e+00, i32 [[FOO4]]
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0
; CHECK-NEXT: store float [[TMP3]], ptr addrspace(1) @pv, align 4
; CHECK-NEXT: ret void
;
%f1 = alloca [5 x float], addrspace(5)
store [5 x float] zeroinitializer, ptr addrspace(5) %f1
%foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3
store float 2.0, ptr addrspace(5) %foo2
; The first field of @block3 is loaded as an i32 and used as a dynamic index.
%foo3 = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 0
%foo4 = load i32, ptr addrspace(1) %foo3
%foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
store float 3.0, ptr addrspace(5) %foo5
; Destination is the alloca base, source is element 3; 8 bytes = 2 floats.
call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo2, i32 8, i1 false)
%foo6 = load float, ptr addrspace(5) %f1
store float %foo6, ptr addrspace(1) @pv
ret void
}
; The memcpy copies all 20 bytes of the [5 x float] alloca onto itself (source
; and destination are the same pointer). Element 0 is only ever written by the
; zeroinitializer store, and the expected output reduces the whole function to
; a single store of 0.0 to @pv.
define amdgpu_vs void @promote_memcpy_identity_aggr() #0 {
; CHECK-LABEL: @promote_memcpy_identity_aggr(
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(1) @pv, align 4
; CHECK-NEXT: ret void
;
%f1 = alloca [5 x float], addrspace(5)
store [5 x float] zeroinitializer, ptr addrspace(5) %f1
%foo1 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 1
store float 1.0, ptr addrspace(5) %foo1
%foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3
store float 2.0, ptr addrspace(5) %foo2
; Identity copy: both pointer operands are %f1.
call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %f1, i32 20, i1 false)
%foo3 = load float, ptr addrspace(5) %f1
store float %foo3, ptr addrspace(1) @pv
ret void
}
; TODO: promote the alloca even if there is a memcpy between different allocas
; memcpy of 8 bytes between two distinct [5 x float] allocas (%f1 into %f2).
; The expected output keeps both allocas and the memcpy call; only the
; zeroinitializer aggregate stores are split into per-element float stores.
define amdgpu_vs void @promote_memcpy_two_aggrs() #0 {
; CHECK-LABEL: @promote_memcpy_two_aggrs(
; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
; CHECK-NEXT: [[F2:%.*]] = alloca [5 x float], align 4, addrspace(5)
; CHECK-NEXT: [[DOTFCA_0_GEP1:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 0
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP1]], align 4
; CHECK-NEXT: [[DOTFCA_1_GEP2:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP2]], align 4
; CHECK-NEXT: [[DOTFCA_2_GEP3:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 2
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP3]], align 4
; CHECK-NEXT: [[DOTFCA_3_GEP4:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP4]], align 4
; CHECK-NEXT: [[DOTFCA_4_GEP5:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 4
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP5]], align 4
; CHECK-NEXT: [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 0
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP]], align 4
; CHECK-NEXT: [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 1
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP]], align 4
; CHECK-NEXT: [[DOTFCA_2_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 2
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP]], align 4
; CHECK-NEXT: [[DOTFCA_3_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 3
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP]], align 4
; CHECK-NEXT: [[DOTFCA_4_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 4
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP]], align 4
; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
; CHECK-NEXT: store float 3.000000e+00, ptr addrspace(5) [[FOO5]], align 4
; CHECK-NEXT: call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 [[F2]], ptr addrspace(5) align 4 [[F1]], i32 8, i1 false)
; CHECK-NEXT: [[FOO6:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F2]], i32 0, i32 [[FOO4]]
; CHECK-NEXT: [[FOO7:%.*]] = load float, ptr addrspace(5) [[FOO6]], align 4
; CHECK-NEXT: store float [[FOO7]], ptr addrspace(1) @pv, align 4
; CHECK-NEXT: ret void
;
%f1 = alloca [5 x float], addrspace(5)
%f2 = alloca [5 x float], addrspace(5)
store [5 x float] zeroinitializer, ptr addrspace(5) %f1
store [5 x float] zeroinitializer, ptr addrspace(5) %f2
; The same loaded i32 indexes the write into %f1 and the later read from %f2.
%foo3 = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 0
%foo4 = load i32, ptr addrspace(1) %foo3
%foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
store float 3.0, ptr addrspace(5) %foo5
; Destination %f2 and source %f1 are different allocas.
call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 %f2, ptr addrspace(5) align 4 %f1, i32 8, i1 false)
%foo6 = getelementptr [5 x float], ptr addrspace(5) %f2, i32 0, i32 %foo4
%foo7 = load float, ptr addrspace(5) %foo6
store float %foo7, ptr addrspace(1) @pv
ret void
}
; TODO: promote the alloca even if there is a memcpy between the alloca and another memory space.
; memcpy of 8 bytes from the [5 x float] alloca %f1 (addrspace(5)) to the
; addrspace(1) global @pv. The expected output keeps the alloca and the memcpy
; call; only the zeroinitializer aggregate store is split into per-element
; float stores. The %src argument is not referenced by the body.
define amdgpu_vs void @promote_memcpy_p1p5_aggr(ptr addrspace(1) inreg %src) #0 {
; CHECK-LABEL: @promote_memcpy_p1p5_aggr(
; CHECK-NEXT: [[F1:%.*]] = alloca [5 x float], align 4, addrspace(5)
; CHECK-NEXT: [[DOTFCA_0_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 0
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_0_GEP]], align 4
; CHECK-NEXT: [[DOTFCA_1_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_1_GEP]], align 4
; CHECK-NEXT: [[DOTFCA_2_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 2
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_2_GEP]], align 4
; CHECK-NEXT: [[DOTFCA_3_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 3
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_3_GEP]], align 4
; CHECK-NEXT: [[DOTFCA_4_GEP:%.*]] = getelementptr inbounds [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 4
; CHECK-NEXT: store float 0.000000e+00, ptr addrspace(5) [[DOTFCA_4_GEP]], align 4
; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [5 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
; CHECK-NEXT: store float 3.000000e+00, ptr addrspace(5) [[FOO5]], align 4
; CHECK-NEXT: call void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) align 4 @pv, ptr addrspace(5) align 4 [[F1]], i32 8, i1 false)
; CHECK-NEXT: ret void
;
%f1 = alloca [5 x float], addrspace(5)
store [5 x float] zeroinitializer, ptr addrspace(5) %f1
%foo3 = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 0
%foo4 = load i32, ptr addrspace(1) %foo3
%foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
store float 3.0, ptr addrspace(5) %foo5
; The copy leaves the alloca's address space: destination is @pv in addrspace(1).
call void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) align 4 @pv, ptr addrspace(5) align 4 %f1, i32 8, i1 false)
ret void
}
; Same shape as the within-alloca memcpy case but using llvm.memcpy.inline:
; zero-initializes a [5 x float] alloca, writes 3.0 at an index loaded from
; @block3, copies 8 bytes from element 3 to the start of the alloca and reads
; element 0. In the expected output the copy becomes a shufflevector with mask
; <3, 4, 2, 3, 4> followed by an extractelement of lane 0.
define amdgpu_vs void @promote_memcpy_inline_aggr() #0 {
; CHECK-LABEL: @promote_memcpy_inline_aggr(
; CHECK-NEXT: [[F1:%.*]] = freeze <5 x float> poison
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <5 x float> [[F1]], float 0.000000e+00, i32 0
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <5 x float> [[TMP6]], float 0.000000e+00, i32 1
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <5 x float> [[TMP7]], float 0.000000e+00, i32 2
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x float> [[TMP8]], float 0.000000e+00, i32 3
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <5 x float> [[TMP4]], float 0.000000e+00, i32 4
; CHECK-NEXT: [[FOO3:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 0
; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(1) [[FOO3]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <5 x float> [[TMP5]], float 3.000000e+00, i32 [[FOO4]]
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <5 x float> [[TMP1]], <5 x float> poison, <5 x i32> <i32 3, i32 4, i32 2, i32 3, i32 4>
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <5 x float> [[TMP2]], i32 0
; CHECK-NEXT: store float [[TMP3]], ptr addrspace(1) @pv, align 4
; CHECK-NEXT: ret void
;
%f1 = alloca [5 x float], addrspace(5)
store [5 x float] zeroinitializer, ptr addrspace(5) %f1
; %foo2 (element 3) is only used as the source pointer of the copy below.
%foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3
%foo3 = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 0
%foo4 = load i32, ptr addrspace(1) %foo3
%foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
store float 3.0, ptr addrspace(5) %foo5
call void @llvm.memcpy.inline.p5.p5.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo2, i32 8, i1 false)
%foo6 = load float, ptr addrspace(5) %f1
store float %foo6, ptr addrspace(1) @pv
ret void
}
; Declarations of the memory-transfer intrinsics called by the memcpy/memmove tests.
; The pN.pM suffix gives the destination and source pointer address spaces.
declare void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
declare void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
declare void @llvm.memcpy.inline.p5.p5.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
declare void @llvm.memmove.p5.p5.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
; Input (@tmp_g) and output (@frag_color) globals of @promote_double_aggr.
@tmp_g = external addrspace(1) global { [4 x double], <2 x double>, <3 x double>, <4 x double> }
@frag_color = external addrspace(1) global <4 x float>
; Stores a [2 x double] aggregate, built with insertvalue, into the alloca %s
; and then mixes scalar element loads with a scalar store to element 0.
; The expected output contains no alloca: element values are taken from the
; aggregate with extractvalue and fed straight into the fadd chain.
define amdgpu_ps void @promote_double_aggr() #0 {
; CHECK-LABEL: @promote_double_aggr(
; CHECK-NEXT: [[FOO:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 0
; CHECK-NEXT: [[FOO1:%.*]] = load double, ptr addrspace(1) [[FOO]], align 8
; CHECK-NEXT: [[FOO2:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 1
; CHECK-NEXT: [[FOO3:%.*]] = load double, ptr addrspace(1) [[FOO2]], align 8
; CHECK-NEXT: [[FOO4:%.*]] = insertvalue [2 x double] poison, double [[FOO1]], 0
; CHECK-NEXT: [[FOO5:%.*]] = insertvalue [2 x double] [[FOO4]], double [[FOO3]], 1
; CHECK-NEXT: [[FOO5_FCA_0_EXTRACT:%.*]] = extractvalue [2 x double] [[FOO5]], 0
; CHECK-NEXT: [[FOO5_FCA_1_EXTRACT:%.*]] = extractvalue [2 x double] [[FOO5]], 1
; CHECK-NEXT: [[FOO10:%.*]] = fadd double [[FOO5_FCA_1_EXTRACT]], [[FOO5_FCA_1_EXTRACT]]
; CHECK-NEXT: [[FOO16:%.*]] = fadd double [[FOO10]], [[FOO5_FCA_1_EXTRACT]]
; CHECK-NEXT: [[FOO17:%.*]] = fptrunc double [[FOO16]] to float
; CHECK-NEXT: [[FOO18:%.*]] = insertelement <4 x float> poison, float [[FOO17]], i32 0
; CHECK-NEXT: [[FOO19:%.*]] = insertelement <4 x float> [[FOO18]], float [[FOO17]], i32 1
; CHECK-NEXT: [[FOO20:%.*]] = insertelement <4 x float> [[FOO19]], float [[FOO17]], i32 2
; CHECK-NEXT: [[FOO21:%.*]] = insertelement <4 x float> [[FOO20]], float [[FOO17]], i32 3
; CHECK-NEXT: store <4 x float> [[FOO21]], ptr addrspace(1) @frag_color, align 16
; CHECK-NEXT: ret void
;
%s = alloca [2 x double], addrspace(5)
%foo = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 0
%foo1 = load double, ptr addrspace(1) %foo
%foo2 = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 1
%foo3 = load double, ptr addrspace(1) %foo2
%foo4 = insertvalue [2 x double] poison, double %foo1, 0
%foo5 = insertvalue [2 x double] %foo4, double %foo3, 1
; Whole-aggregate store into the alloca: the non-canonical access under test.
store [2 x double] %foo5, ptr addrspace(5) %s
; Element 1 is loaded twice through two separate but identical GEPs.
%foo6 = getelementptr [2 x double], ptr addrspace(5) %s, i32 0, i32 1
%foo7 = load double, ptr addrspace(5) %foo6
%foo8 = getelementptr [2 x double], ptr addrspace(5) %s, i32 0, i32 1
%foo9 = load double, ptr addrspace(5) %foo8
%foo10 = fadd double %foo7, %foo9
; Store through the base pointer of %s, i.e. this overwrites element 0.
store double %foo10, ptr addrspace(5) %s
%foo13 = load double, ptr addrspace(5) %s
%foo14 = getelementptr [2 x double], ptr addrspace(5) %s, i32 0, i32 1
%foo15 = load double, ptr addrspace(5) %foo14
%foo16 = fadd double %foo13, %foo15
%foo17 = fptrunc double %foo16 to float
%foo18 = insertelement <4 x float> poison, float %foo17, i32 0
%foo19 = insertelement <4 x float> %foo18, float %foo17, i32 1
%foo20 = insertelement <4 x float> %foo19, float %foo17, i32 2
%foo21 = insertelement <4 x float> %foo20, float %foo17, i32 3
store <4 x float> %foo21, ptr addrspace(1) @frag_color
ret void
}
; Don't crash on a type that isn't a valid vector element.
; The alloca's element type is the two-field %struct and the alloca itself is
; never used; the expected output is a function body containing only ret void.
define amdgpu_kernel void @alloca_struct() #0 {
; CHECK-LABEL: @alloca_struct(
; CHECK-NEXT: entry:
; CHECK-NEXT: ret void
;
entry:
%alloca = alloca [2 x %struct], align 4, addrspace(5)
ret void
}