The original commit exposed several missing dependencies (e.g. latent bugs in SLP scheduling). Most of these were fixed over the weekend and have had several days to bake. The last was fixed this morning after being noticed in manual review of test changes yesterday. See the review thread for links to each change.

Original commit message follows:

SLP currently schedules all instructions within a scheduling window which stretches from the first instruction potentially vectorized to the last. This window can include a very large number of unrelated instructions which are not being considered for vectorization. This change switches the code to only schedule the sub-graph consisting of the instructions being vectorized and their transitive users.

This has the effect of greatly reducing the amount of work performed in large basic blocks, and thus greatly improves compile time on degenerate examples. To understand the effects, I added some statistics (not planned for upstream contribution). Here's an illustration from my motivating example:

Before this patch:
  704357 SLP - Number of calcDeps actions
  699021 SLP - Number of schedule calls
    5598 SLP - Number of ReSchedule actions
      59 SLP - Number of ReScheduleOnFail actions
   10084 SLP - Number of schedule resets
    8523 SLP - Number of vector instructions generated

After this patch:
  102895 SLP - Number of calcDeps actions
  161916 SLP - Number of schedule calls
    5637 SLP - Number of ReSchedule actions
      55 SLP - Number of ReScheduleOnFail actions
   10083 SLP - Number of schedule resets
    8403 SLP - Number of vector instructions generated

I do want to highlight that there is a small difference in the number of generated vector instructions. This example is hitting the bailout due to maximum window size, and the change in scheduling is slightly perturbing when and how we hit it. This can be seen in the ReScheduleOnFail counter change. Given that, I think we can safely ignore it.

The downside of this change can be seen in the large test diff. We group all vectorizable instructions together at the bottom of the scheduling region. This means that vector instructions can move quite far from their original point in code. While maybe undesirable, I don't see this as being a major problem as this pass is not intended to be a general scheduling pass.

For context, it's worth noting that the pre-scheduling that SLP does while building the vector tree is exactly the sub-graph scheduling implemented by this patch.

Differential Revision: https://reviews.llvm.org/D118538
880 lines
62 KiB
LLVM
880 lines
62 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
|
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE
|
|
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX
|
|
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2
|
|
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F
|
|
; RUN: opt < %s -slp-vectorizer -instcombine -S -mtriple=x86_64-unknown-linux -mattr=+avx512vl | FileCheck %s --check-prefixes=CHECK,AVX512VL
|
|
|
|
; Test input: four scalar i32 loads from non-consecutive elements of %1
; (element offsets 0, 11, 4 and 1) are packed into a <4 x i32> with an
; insertelement chain, <1, 2, 3, 4> is added, and the result is written with
; one vector store through a bitcast of %0.
; The getelementptrs on %0 (%5, %8, %11) are never used in this function.
; The expectations below use the prefix shared by every RUN line, so the same
; output is required for all five -mattr configurations: the scalar loads and
; the insertelement chain remain, and the unused getelementptrs on %0 are gone.
define void @gather_load(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) {
|
|
; CHECK-LABEL: @gather_load(
|
|
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
|
|
; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
|
|
; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
|
|
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
|
|
; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
|
|
; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
|
|
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i64 0
|
|
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP6]], i64 1
|
|
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 2
|
|
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 3
|
|
; CHECK-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
|
|
; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
|
|
; CHECK-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA0]]
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
%3 = getelementptr inbounds i32, i32* %1, i64 1
|
|
%4 = load i32, i32* %1, align 4, !tbaa !2
|
|
%5 = getelementptr inbounds i32, i32* %0, i64 1
|
|
%6 = getelementptr inbounds i32, i32* %1, i64 11
|
|
%7 = load i32, i32* %6, align 4, !tbaa !2
|
|
%8 = getelementptr inbounds i32, i32* %0, i64 2
|
|
%9 = getelementptr inbounds i32, i32* %1, i64 4
|
|
%10 = load i32, i32* %9, align 4, !tbaa !2
|
|
%11 = getelementptr inbounds i32, i32* %0, i64 3
|
|
%12 = load i32, i32* %3, align 4, !tbaa !2
|
|
%13 = insertelement <4 x i32> undef, i32 %4, i32 0
|
|
%14 = insertelement <4 x i32> %13, i32 %7, i32 1
|
|
%15 = insertelement <4 x i32> %14, i32 %10, i32 2
|
|
%16 = insertelement <4 x i32> %15, i32 %12, i32 3
|
|
%17 = add nsw <4 x i32> %16, <i32 1, i32 2, i32 3, i32 4>
|
|
%18 = bitcast i32* %0 to <4 x i32>*
|
|
store <4 x i32> %17, <4 x i32>* %18, align 4, !tbaa !2
|
|
ret void
|
|
}
|
|
|
|
; Test input: four independent scalar chains. Each one loads an i32 from a
; non-consecutive element of %1 (element offsets 1, 10, 3 and 5), adds a
; constant (1, 2, 3, 4, with nsw) and stores the sum to consecutive elements
; 0..3 of %0.
; Expected output per -mattr configuration, as encoded by the directives below
;   * sse2 and avx512f - the code stays fully scalar, matching the input.
;   * avx and avx2 - the four loads stay scalar and are packed into a
;     <4 x i32> via insertelement (starting from poison), followed by a single
;     vector add and a single vector store to %0.
;   * avx512vl - the four loads become one llvm.masked.gather over a vector
;     getelementptr with indices <1, 10, 3, 5>, followed by a single vector
;     add and a single vector store to %0.
define void @gather_load_2(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) {
|
|
; SSE-LABEL: @gather_load_2(
|
|
; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
|
|
; SSE-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1
|
|
; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
|
|
; SSE-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
|
|
; SSE-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2
|
|
; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
|
|
; SSE-NEXT: store i32 [[TMP9]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
|
|
; SSE-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
|
|
; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
|
|
; SSE-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
|
|
; SSE-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
|
|
; SSE-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: ret void
|
|
;
|
|
; AVX-LABEL: @gather_load_2(
|
|
; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
|
|
; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
|
|
; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
|
|
; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
|
|
; AVX-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0
|
|
; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1
|
|
; AVX-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2
|
|
; AVX-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
|
|
; AVX-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
|
|
; AVX-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
|
|
; AVX-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: ret void
|
|
;
|
|
; AVX2-LABEL: @gather_load_2(
|
|
; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
|
|
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
|
|
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
|
|
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
|
|
; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0
|
|
; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i64 1
|
|
; AVX2-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i64 2
|
|
; AVX2-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3
|
|
; AVX2-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], <i32 1, i32 2, i32 3, i32 4>
|
|
; AVX2-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
|
|
; AVX2-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: ret void
|
|
;
|
|
; AVX512F-LABEL: @gather_load_2(
|
|
; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 1
|
|
; AVX512F-NEXT: [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1
|
|
; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
|
|
; AVX512F-NEXT: store i32 [[TMP5]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 10
|
|
; AVX512F-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2
|
|
; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
|
|
; AVX512F-NEXT: store i32 [[TMP9]], i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 3
|
|
; AVX512F-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
|
|
; AVX512F-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
|
|
; AVX512F-NEXT: store i32 [[TMP13]], i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 5
|
|
; AVX512F-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP15]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
|
|
; AVX512F-NEXT: store i32 [[TMP17]], i32* [[TMP14]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: ret void
|
|
;
|
|
; AVX512VL-LABEL: @gather_load_2(
|
|
; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i64 0
|
|
; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer
|
|
; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 1, i64 10, i64 3, i64 5>
|
|
; AVX512VL-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP4]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
|
|
; AVX512VL-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], <i32 1, i32 2, i32 3, i32 4>
|
|
; AVX512VL-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP0:%.*]] to <4 x i32>*
|
|
; AVX512VL-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* [[TMP7]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512VL-NEXT: ret void
|
|
;
|
|
%3 = getelementptr inbounds i32, i32* %1, i64 1
|
|
%4 = load i32, i32* %3, align 4, !tbaa !2
|
|
%5 = add nsw i32 %4, 1
|
|
%6 = getelementptr inbounds i32, i32* %0, i64 1
|
|
store i32 %5, i32* %0, align 4, !tbaa !2
|
|
%7 = getelementptr inbounds i32, i32* %1, i64 10
|
|
%8 = load i32, i32* %7, align 4, !tbaa !2
|
|
%9 = add nsw i32 %8, 2
|
|
%10 = getelementptr inbounds i32, i32* %0, i64 2
|
|
store i32 %9, i32* %6, align 4, !tbaa !2
|
|
%11 = getelementptr inbounds i32, i32* %1, i64 3
|
|
%12 = load i32, i32* %11, align 4, !tbaa !2
|
|
%13 = add nsw i32 %12, 3
|
|
%14 = getelementptr inbounds i32, i32* %0, i64 3
|
|
store i32 %13, i32* %10, align 4, !tbaa !2
|
|
%15 = getelementptr inbounds i32, i32* %1, i64 5
|
|
%16 = load i32, i32* %15, align 4, !tbaa !2
|
|
%17 = add nsw i32 %16, 4
|
|
store i32 %17, i32* %14, align 4, !tbaa !2
|
|
ret void
|
|
}
|
|
|
|
|
|
; Test input: eight independent scalar chains. Each one loads an i32 from a
; non-consecutive element of %1 (element offsets 0, 11, 4, 15, 18, 9, 6 and
; 21), adds a constant (1, 2, 3, 4, 1, 2, 3, 4, plain add without nsw) and
; stores the sum to consecutive elements 0..7 of %0.
; Expected output per -mattr configuration, as encoded by the directives below
;   * sse2 and avx512f - the code stays fully scalar, matching the input.
;   * avx and avx2 - the eight loads stay scalar and are packed into an
;     <8 x i32> via insertelement (starting from poison), followed by a single
;     vector add and a single vector store to %0.
;   * avx512vl - only the four chains reading source offsets 11, 4, 15 and 18
;     are combined, using llvm.masked.gather plus one <4 x i32> add and one
;     <4 x i32> store starting at element 1 of %0. The chain for source
;     offset 0 and the chains for source offsets 9, 6 and 21 remain scalar.
define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture readonly %1) {
|
|
; SSE-LABEL: @gather_load_3(
|
|
; SSE-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1
|
|
; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
|
|
; SSE-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
|
|
; SSE-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2
|
|
; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
|
|
; SSE-NEXT: store i32 [[TMP8]], i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
|
|
; SSE-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3
|
|
; SSE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
|
|
; SSE-NEXT: store i32 [[TMP12]], i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
|
|
; SSE-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4
|
|
; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
|
|
; SSE-NEXT: store i32 [[TMP16]], i32* [[TMP13]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
|
|
; SSE-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1
|
|
; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
|
|
; SSE-NEXT: store i32 [[TMP20]], i32* [[TMP17]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
|
|
; SSE-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2
|
|
; SSE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
|
|
; SSE-NEXT: store i32 [[TMP24]], i32* [[TMP21]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
|
|
; SSE-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3
|
|
; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
|
|
; SSE-NEXT: store i32 [[TMP28]], i32* [[TMP25]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
|
|
; SSE-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4
|
|
; SSE-NEXT: store i32 [[TMP32]], i32* [[TMP29]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: ret void
|
|
;
|
|
; AVX-LABEL: @gather_load_3(
|
|
; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11
|
|
; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
|
|
; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
|
|
; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
|
|
; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
|
|
; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
|
|
; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
|
|
; AVX-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0
|
|
; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1
|
|
; AVX-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2
|
|
; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3
|
|
; AVX-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4
|
|
; AVX-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5
|
|
; AVX-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6
|
|
; AVX-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7
|
|
; AVX-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
|
|
; AVX-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
|
|
; AVX-NEXT: store <8 x i32> [[TMP26]], <8 x i32>* [[TMP27]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: ret void
|
|
;
|
|
; AVX2-LABEL: @gather_load_3(
|
|
; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1:%.*]], i64 11
|
|
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
|
|
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
|
|
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
|
|
; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
|
|
; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
|
|
; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
|
|
; AVX2-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP3]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP4]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP7]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP16:%.*]] = load i32, i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP10]], i64 0
|
|
; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP11]], i64 1
|
|
; AVX2-NEXT: [[TMP20:%.*]] = insertelement <8 x i32> [[TMP19]], i32 [[TMP12]], i64 2
|
|
; AVX2-NEXT: [[TMP21:%.*]] = insertelement <8 x i32> [[TMP20]], i32 [[TMP13]], i64 3
|
|
; AVX2-NEXT: [[TMP22:%.*]] = insertelement <8 x i32> [[TMP21]], i32 [[TMP14]], i64 4
|
|
; AVX2-NEXT: [[TMP23:%.*]] = insertelement <8 x i32> [[TMP22]], i32 [[TMP15]], i64 5
|
|
; AVX2-NEXT: [[TMP24:%.*]] = insertelement <8 x i32> [[TMP23]], i32 [[TMP16]], i64 6
|
|
; AVX2-NEXT: [[TMP25:%.*]] = insertelement <8 x i32> [[TMP24]], i32 [[TMP17]], i64 7
|
|
; AVX2-NEXT: [[TMP26:%.*]] = add <8 x i32> [[TMP25]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
|
|
; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP0:%.*]] to <8 x i32>*
|
|
; AVX2-NEXT: store <8 x i32> [[TMP26]], <8 x i32>* [[TMP27]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: ret void
|
|
;
|
|
; AVX512F-LABEL: @gather_load_3(
|
|
; AVX512F-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1
|
|
; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
|
|
; AVX512F-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11
|
|
; AVX512F-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2
|
|
; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
|
|
; AVX512F-NEXT: store i32 [[TMP8]], i32* [[TMP5]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4
|
|
; AVX512F-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3
|
|
; AVX512F-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3
|
|
; AVX512F-NEXT: store i32 [[TMP12]], i32* [[TMP9]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15
|
|
; AVX512F-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4
|
|
; AVX512F-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
|
|
; AVX512F-NEXT: store i32 [[TMP16]], i32* [[TMP13]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 18
|
|
; AVX512F-NEXT: [[TMP19:%.*]] = load i32, i32* [[TMP18]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1
|
|
; AVX512F-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
|
|
; AVX512F-NEXT: store i32 [[TMP20]], i32* [[TMP17]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
|
|
; AVX512F-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2
|
|
; AVX512F-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
|
|
; AVX512F-NEXT: store i32 [[TMP24]], i32* [[TMP21]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
|
|
; AVX512F-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3
|
|
; AVX512F-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
|
|
; AVX512F-NEXT: store i32 [[TMP28]], i32* [[TMP25]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
|
|
; AVX512F-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP30]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4
|
|
; AVX512F-NEXT: store i32 [[TMP32]], i32* [[TMP29]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: ret void
|
|
;
|
|
; AVX512VL-LABEL: @gather_load_3(
|
|
; AVX512VL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512VL-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1
|
|
; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1
|
|
; AVX512VL-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0
|
|
; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer
|
|
; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
|
|
; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
|
|
; AVX512VL-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
|
|
; AVX512VL-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], <i32 2, i32 3, i32 4, i32 1>
|
|
; AVX512VL-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
|
|
; AVX512VL-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512VL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9
|
|
; AVX512VL-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512VL-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], 2
|
|
; AVX512VL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
|
|
; AVX512VL-NEXT: store i32 [[TMP14]], i32* [[TMP8]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512VL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6
|
|
; AVX512VL-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512VL-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], 3
|
|
; AVX512VL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7
|
|
; AVX512VL-NEXT: store i32 [[TMP18]], i32* [[TMP15]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512VL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21
|
|
; AVX512VL-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP20]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512VL-NEXT: [[TMP22:%.*]] = add i32 [[TMP21]], 4
|
|
; AVX512VL-NEXT: store i32 [[TMP22]], i32* [[TMP19]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512VL-NEXT: ret void
|
|
;
|
|
%3 = load i32, i32* %1, align 4, !tbaa !2
|
|
%4 = add i32 %3, 1
|
|
%5 = getelementptr inbounds i32, i32* %0, i64 1
|
|
store i32 %4, i32* %0, align 4, !tbaa !2
|
|
%6 = getelementptr inbounds i32, i32* %1, i64 11
|
|
%7 = load i32, i32* %6, align 4, !tbaa !2
|
|
%8 = add i32 %7, 2
|
|
%9 = getelementptr inbounds i32, i32* %0, i64 2
|
|
store i32 %8, i32* %5, align 4, !tbaa !2
|
|
%10 = getelementptr inbounds i32, i32* %1, i64 4
|
|
%11 = load i32, i32* %10, align 4, !tbaa !2
|
|
%12 = add i32 %11, 3
|
|
%13 = getelementptr inbounds i32, i32* %0, i64 3
|
|
store i32 %12, i32* %9, align 4, !tbaa !2
|
|
%14 = getelementptr inbounds i32, i32* %1, i64 15
|
|
%15 = load i32, i32* %14, align 4, !tbaa !2
|
|
%16 = add i32 %15, 4
|
|
%17 = getelementptr inbounds i32, i32* %0, i64 4
|
|
store i32 %16, i32* %13, align 4, !tbaa !2
|
|
%18 = getelementptr inbounds i32, i32* %1, i64 18
|
|
%19 = load i32, i32* %18, align 4, !tbaa !2
|
|
%20 = add i32 %19, 1
|
|
%21 = getelementptr inbounds i32, i32* %0, i64 5
|
|
store i32 %20, i32* %17, align 4, !tbaa !2
|
|
%22 = getelementptr inbounds i32, i32* %1, i64 9
|
|
%23 = load i32, i32* %22, align 4, !tbaa !2
|
|
%24 = add i32 %23, 2
|
|
%25 = getelementptr inbounds i32, i32* %0, i64 6
|
|
store i32 %24, i32* %21, align 4, !tbaa !2
|
|
%26 = getelementptr inbounds i32, i32* %1, i64 6
|
|
%27 = load i32, i32* %26, align 4, !tbaa !2
|
|
%28 = add i32 %27, 3
|
|
%29 = getelementptr inbounds i32, i32* %0, i64 7
|
|
store i32 %28, i32* %25, align 4, !tbaa !2
|
|
%30 = getelementptr inbounds i32, i32* %1, i64 21
|
|
%31 = load i32, i32* %30, align 4, !tbaa !2
|
|
%32 = add i32 %31, 4
|
|
store i32 %32, i32* %29, align 4, !tbaa !2
|
|
ret void
|
|
}
|
|
|
|
define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture readonly %t1) {
|
|
; SSE-LABEL: @gather_load_4(
|
|
; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
|
|
; SSE-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
|
|
; SSE-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2
|
|
; SSE-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
|
|
; SSE-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3
|
|
; SSE-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
|
|
; SSE-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4
|
|
; SSE-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
|
|
; SSE-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
|
|
; SSE-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
|
|
; SSE-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
|
|
; SSE-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
|
|
; SSE-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
|
|
; SSE-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
|
|
; SSE-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[T4:%.*]] = add i32 [[T3]], 1
|
|
; SSE-NEXT: [[T8:%.*]] = add i32 [[T7]], 2
|
|
; SSE-NEXT: [[T12:%.*]] = add i32 [[T11]], 3
|
|
; SSE-NEXT: [[T16:%.*]] = add i32 [[T15]], 4
|
|
; SSE-NEXT: [[T20:%.*]] = add i32 [[T19]], 1
|
|
; SSE-NEXT: [[T24:%.*]] = add i32 [[T23]], 2
|
|
; SSE-NEXT: [[T28:%.*]] = add i32 [[T27]], 3
|
|
; SSE-NEXT: [[T32:%.*]] = add i32 [[T31]], 4
|
|
; SSE-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: store i32 [[T8]], i32* [[T5]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: store i32 [[T12]], i32* [[T9]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: store i32 [[T16]], i32* [[T13]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: store i32 [[T20]], i32* [[T17]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: ret void
|
|
;
|
|
; AVX-LABEL: @gather_load_4(
|
|
; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
|
|
; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
|
|
; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
|
|
; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
|
|
; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
|
|
; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
|
|
; AVX-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
|
|
; AVX-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0
|
|
; AVX-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1
|
|
; AVX-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2
|
|
; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3
|
|
; AVX-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4
|
|
; AVX-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5
|
|
; AVX-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6
|
|
; AVX-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7
|
|
; AVX-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
|
|
; AVX-NEXT: [[TMP10:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
|
|
; AVX-NEXT: store <8 x i32> [[TMP9]], <8 x i32>* [[TMP10]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: ret void
|
|
;
|
|
; AVX2-LABEL: @gather_load_4(
|
|
; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
|
|
; AVX2-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
|
|
; AVX2-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
|
|
; AVX2-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
|
|
; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
|
|
; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
|
|
; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
|
|
; AVX2-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[T3]], i64 0
|
|
; AVX2-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[T7]], i64 1
|
|
; AVX2-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[T11]], i64 2
|
|
; AVX2-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[T15]], i64 3
|
|
; AVX2-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[T19]], i64 4
|
|
; AVX2-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[T23]], i64 5
|
|
; AVX2-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T27]], i64 6
|
|
; AVX2-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[T31]], i64 7
|
|
; AVX2-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], <i32 1, i32 2, i32 3, i32 4, i32 1, i32 2, i32 3, i32 4>
|
|
; AVX2-NEXT: [[TMP10:%.*]] = bitcast i32* [[T0:%.*]] to <8 x i32>*
|
|
; AVX2-NEXT: store <8 x i32> [[TMP9]], <8 x i32>* [[TMP10]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: ret void
|
|
;
|
|
; AVX512F-LABEL: @gather_load_4(
|
|
; AVX512F-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
|
|
; AVX512F-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11
|
|
; AVX512F-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2
|
|
; AVX512F-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4
|
|
; AVX512F-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3
|
|
; AVX512F-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15
|
|
; AVX512F-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4
|
|
; AVX512F-NEXT: [[T18:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 18
|
|
; AVX512F-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
|
|
; AVX512F-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
|
|
; AVX512F-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
|
|
; AVX512F-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
|
|
; AVX512F-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
|
|
; AVX512F-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
|
|
; AVX512F-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[T19:%.*]] = load i32, i32* [[T18]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[T4:%.*]] = add i32 [[T3]], 1
|
|
; AVX512F-NEXT: [[T8:%.*]] = add i32 [[T7]], 2
|
|
; AVX512F-NEXT: [[T12:%.*]] = add i32 [[T11]], 3
|
|
; AVX512F-NEXT: [[T16:%.*]] = add i32 [[T15]], 4
|
|
; AVX512F-NEXT: [[T20:%.*]] = add i32 [[T19]], 1
|
|
; AVX512F-NEXT: [[T24:%.*]] = add i32 [[T23]], 2
|
|
; AVX512F-NEXT: [[T28:%.*]] = add i32 [[T27]], 3
|
|
; AVX512F-NEXT: [[T32:%.*]] = add i32 [[T31]], 4
|
|
; AVX512F-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: store i32 [[T8]], i32* [[T5]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: store i32 [[T12]], i32* [[T9]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: store i32 [[T16]], i32* [[T13]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: store i32 [[T20]], i32* [[T17]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: ret void
|
|
;
|
|
; AVX512VL-LABEL: @gather_load_4(
|
|
; AVX512VL-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1
|
|
; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0
|
|
; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer
|
|
; AVX512VL-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> <i64 11, i64 4, i64 15, i64 18>
|
|
; AVX512VL-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5
|
|
; AVX512VL-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9
|
|
; AVX512VL-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6
|
|
; AVX512VL-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6
|
|
; AVX512VL-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7
|
|
; AVX512VL-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21
|
|
; AVX512VL-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512VL-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512VL-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512VL-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512VL-NEXT: [[T4:%.*]] = add i32 [[T3]], 1
|
|
; AVX512VL-NEXT: [[T24:%.*]] = add i32 [[T23]], 2
|
|
; AVX512VL-NEXT: [[T28:%.*]] = add i32 [[T27]], 3
|
|
; AVX512VL-NEXT: [[T32:%.*]] = add i32 [[T31]], 4
|
|
; AVX512VL-NEXT: store i32 [[T4]], i32* [[T0]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512VL-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef), !tbaa [[TBAA0]]
|
|
; AVX512VL-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], <i32 2, i32 3, i32 4, i32 1>
|
|
; AVX512VL-NEXT: [[TMP5:%.*]] = bitcast i32* [[T5]] to <4 x i32>*
|
|
; AVX512VL-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512VL-NEXT: store i32 [[T24]], i32* [[T21]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512VL-NEXT: store i32 [[T28]], i32* [[T25]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512VL-NEXT: store i32 [[T32]], i32* [[T29]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512VL-NEXT: ret void
|
|
;
|
|
%t5 = getelementptr inbounds i32, i32* %t0, i64 1
|
|
%t6 = getelementptr inbounds i32, i32* %t1, i64 11
|
|
%t9 = getelementptr inbounds i32, i32* %t0, i64 2
|
|
%t10 = getelementptr inbounds i32, i32* %t1, i64 4
|
|
%t13 = getelementptr inbounds i32, i32* %t0, i64 3
|
|
%t14 = getelementptr inbounds i32, i32* %t1, i64 15
|
|
%t17 = getelementptr inbounds i32, i32* %t0, i64 4
|
|
%t18 = getelementptr inbounds i32, i32* %t1, i64 18
|
|
%t21 = getelementptr inbounds i32, i32* %t0, i64 5
|
|
%t22 = getelementptr inbounds i32, i32* %t1, i64 9
|
|
%t25 = getelementptr inbounds i32, i32* %t0, i64 6
|
|
%t26 = getelementptr inbounds i32, i32* %t1, i64 6
|
|
%t29 = getelementptr inbounds i32, i32* %t0, i64 7
|
|
%t30 = getelementptr inbounds i32, i32* %t1, i64 21
|
|
|
|
%t3 = load i32, i32* %t1, align 4, !tbaa !2
|
|
%t7 = load i32, i32* %t6, align 4, !tbaa !2
|
|
%t11 = load i32, i32* %t10, align 4, !tbaa !2
|
|
%t15 = load i32, i32* %t14, align 4, !tbaa !2
|
|
%t19 = load i32, i32* %t18, align 4, !tbaa !2
|
|
%t23 = load i32, i32* %t22, align 4, !tbaa !2
|
|
%t27 = load i32, i32* %t26, align 4, !tbaa !2
|
|
%t31 = load i32, i32* %t30, align 4, !tbaa !2
|
|
|
|
%t4 = add i32 %t3, 1
|
|
%t8 = add i32 %t7, 2
|
|
%t12 = add i32 %t11, 3
|
|
%t16 = add i32 %t15, 4
|
|
%t20 = add i32 %t19, 1
|
|
%t24 = add i32 %t23, 2
|
|
%t28 = add i32 %t27, 3
|
|
%t32 = add i32 %t31, 4
|
|
|
|
store i32 %t4, i32* %t0, align 4, !tbaa !2
|
|
store i32 %t8, i32* %t5, align 4, !tbaa !2
|
|
store i32 %t12, i32* %t9, align 4, !tbaa !2
|
|
store i32 %t16, i32* %t13, align 4, !tbaa !2
|
|
store i32 %t20, i32* %t17, align 4, !tbaa !2
|
|
store i32 %t24, i32* %t21, align 4, !tbaa !2
|
|
store i32 %t28, i32* %t25, align 4, !tbaa !2
|
|
store i32 %t32, i32* %t29, align 4, !tbaa !2
|
|
|
|
ret void
|
|
}
|
|
|
|
|
|
define void @gather_load_div(float* noalias nocapture %0, float* noalias nocapture readonly %1) {
|
|
; SSE-LABEL: @gather_load_div(
|
|
; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4
|
|
; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10
|
|
; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13
|
|
; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
|
|
; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11
|
|
; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
|
|
; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44
|
|
; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP0:%.*]], i64 4
|
|
; SSE-NEXT: [[TMP11:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP12:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP13:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP14:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP15:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP16:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP17:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP18:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP11]], i64 0
|
|
; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP13]], i64 1
|
|
; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP15]], i64 2
|
|
; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP17]], i64 3
|
|
; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP12]], i64 0
|
|
; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP14]], i64 1
|
|
; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP16]], i64 2
|
|
; SSE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP18]], i64 3
|
|
; SSE-NEXT: [[TMP27:%.*]] = fdiv <4 x float> [[TMP22]], [[TMP26]]
|
|
; SSE-NEXT: [[TMP28:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
|
|
; SSE-NEXT: store <4 x float> [[TMP27]], <4 x float>* [[TMP28]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
|
|
; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33
|
|
; SSE-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
|
|
; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30
|
|
; SSE-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
|
|
; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27
|
|
; SSE-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
|
|
; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23
|
|
; SSE-NEXT: [[TMP37:%.*]] = load float, float* [[TMP29]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP38:%.*]] = load float, float* [[TMP30]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP39:%.*]] = load float, float* [[TMP31]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP40:%.*]] = load float, float* [[TMP32]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP41:%.*]] = load float, float* [[TMP33]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP42:%.*]] = load float, float* [[TMP34]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP43:%.*]] = load float, float* [[TMP35]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP44:%.*]] = load float, float* [[TMP36]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> poison, float [[TMP37]], i64 0
|
|
; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP39]], i64 1
|
|
; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP41]], i64 2
|
|
; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> [[TMP47]], float [[TMP43]], i64 3
|
|
; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> poison, float [[TMP38]], i64 0
|
|
; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP40]], i64 1
|
|
; SSE-NEXT: [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP42]], i64 2
|
|
; SSE-NEXT: [[TMP52:%.*]] = insertelement <4 x float> [[TMP51]], float [[TMP44]], i64 3
|
|
; SSE-NEXT: [[TMP53:%.*]] = fdiv <4 x float> [[TMP48]], [[TMP52]]
|
|
; SSE-NEXT: [[TMP54:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
|
|
; SSE-NEXT: store <4 x float> [[TMP53]], <4 x float>* [[TMP54]], align 4, !tbaa [[TBAA0]]
|
|
; SSE-NEXT: ret void
|
|
;
|
|
; AVX-LABEL: @gather_load_div(
|
|
; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4
|
|
; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10
|
|
; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13
|
|
; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
|
|
; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11
|
|
; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
|
|
; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44
|
|
; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
|
|
; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33
|
|
; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
|
|
; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30
|
|
; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
|
|
; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27
|
|
; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
|
|
; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23
|
|
; AVX-NEXT: [[TMP18:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP19:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP20:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP21:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP22:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP23:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP24:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP25:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP26:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP27:%.*]] = load float, float* [[TMP11]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP28:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP29:%.*]] = load float, float* [[TMP13]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP30:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP31:%.*]] = load float, float* [[TMP15]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP32:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP33:%.*]] = load float, float* [[TMP17]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0
|
|
; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP20]], i64 1
|
|
; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 2
|
|
; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP24]], i64 3
|
|
; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP26]], i64 4
|
|
; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP28]], i64 5
|
|
; AVX-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP30]], i64 6
|
|
; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP32]], i64 7
|
|
; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0
|
|
; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP21]], i64 1
|
|
; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP23]], i64 2
|
|
; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP25]], i64 3
|
|
; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP27]], i64 4
|
|
; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP29]], i64 5
|
|
; AVX-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP31]], i64 6
|
|
; AVX-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7
|
|
; AVX-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]]
|
|
; AVX-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
|
|
; AVX-NEXT: store <8 x float> [[TMP50]], <8 x float>* [[TMP51]], align 4, !tbaa [[TBAA0]]
|
|
; AVX-NEXT: ret void
|
|
;
|
|
; AVX2-LABEL: @gather_load_div(
|
|
; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 4
|
|
; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 10
|
|
; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 13
|
|
; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 3
|
|
; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 11
|
|
; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 14
|
|
; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 44
|
|
; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 17
|
|
; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 33
|
|
; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 8
|
|
; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 30
|
|
; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 5
|
|
; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 27
|
|
; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
|
|
; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 23
|
|
; AVX2-NEXT: [[TMP18:%.*]] = load float, float* [[TMP1]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP19:%.*]] = load float, float* [[TMP3]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP20:%.*]] = load float, float* [[TMP4]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP21:%.*]] = load float, float* [[TMP5]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP22:%.*]] = load float, float* [[TMP6]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP23:%.*]] = load float, float* [[TMP7]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP24:%.*]] = load float, float* [[TMP8]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP25:%.*]] = load float, float* [[TMP9]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP26:%.*]] = load float, float* [[TMP10]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP27:%.*]] = load float, float* [[TMP11]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP28:%.*]] = load float, float* [[TMP12]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP29:%.*]] = load float, float* [[TMP13]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP30:%.*]] = load float, float* [[TMP14]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP31:%.*]] = load float, float* [[TMP15]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP32:%.*]] = load float, float* [[TMP16]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP33:%.*]] = load float, float* [[TMP17]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP18]], i64 0
|
|
; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP20]], i64 1
|
|
; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP22]], i64 2
|
|
; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP24]], i64 3
|
|
; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP26]], i64 4
|
|
; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP28]], i64 5
|
|
; AVX2-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP30]], i64 6
|
|
; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP32]], i64 7
|
|
; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP19]], i64 0
|
|
; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP21]], i64 1
|
|
; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP23]], i64 2
|
|
; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP25]], i64 3
|
|
; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP27]], i64 4
|
|
; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP29]], i64 5
|
|
; AVX2-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP31]], i64 6
|
|
; AVX2-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7
|
|
; AVX2-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]]
|
|
; AVX2-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
|
|
; AVX2-NEXT: store <8 x float> [[TMP50]], <8 x float>* [[TMP51]], align 4, !tbaa [[TBAA0]]
|
|
; AVX2-NEXT: ret void
|
|
;
|
|
; AVX512F-LABEL: @gather_load_div(
|
|
; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <8 x float*> poison, float* [[TMP1:%.*]], i64 0
|
|
; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP3]], <8 x float*> poison, <8 x i32> zeroinitializer
|
|
; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
|
|
; AVX512F-NEXT: [[TMP5:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i64 0
|
|
; AVX512F-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP5]], <4 x float*> poison, <4 x i32> zeroinitializer
|
|
; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> <i64 10, i64 3, i64 14, i64 17>
|
|
; AVX512F-NEXT: [[TMP7:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0
|
|
; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <2 x i32> zeroinitializer
|
|
; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr float, <2 x float*> [[TMP8]], <2 x i64> <i64 8, i64 5>
|
|
; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
|
|
; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
|
|
; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP3]], <8 x float*> [[TMP11]], <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef>
|
|
; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP9]], <2 x float*> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
|
|
; AVX512F-NEXT: [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 undef>
|
|
; AVX512F-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP10]], i64 7
|
|
; AVX512F-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP4]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP17]]
|
|
; AVX512F-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
|
|
; AVX512F-NEXT: store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512F-NEXT: ret void
|
|
;
|
|
; AVX512VL-LABEL: @gather_load_div(
|
|
; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <8 x float*> poison, float* [[TMP1:%.*]], i64 0
|
|
; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP3]], <8 x float*> poison, <8 x i32> zeroinitializer
|
|
; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
|
|
; AVX512VL-NEXT: [[TMP5:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i64 0
|
|
; AVX512VL-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP5]], <4 x float*> poison, <4 x i32> zeroinitializer
|
|
; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> <i64 10, i64 3, i64 14, i64 17>
|
|
; AVX512VL-NEXT: [[TMP7:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0
|
|
; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <2 x i32> zeroinitializer
|
|
; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr float, <2 x float*> [[TMP8]], <2 x i64> <i64 8, i64 5>
|
|
; AVX512VL-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20
|
|
; AVX512VL-NEXT: [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
|
|
; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP3]], <8 x float*> [[TMP11]], <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef>
|
|
; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP9]], <2 x float*> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
|
|
; AVX512VL-NEXT: [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 8, i32 9, i32 undef>
|
|
; AVX512VL-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP10]], i64 7
|
|
; AVX512VL-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
|
|
; AVX512VL-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP4]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
|
|
; AVX512VL-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP17]]
|
|
; AVX512VL-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
|
|
; AVX512VL-NEXT: store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512VL-NEXT: ret void
|
|
;
|
|
; AVX512-LABEL: @gather_load_div(
|
|
; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1:%.*]], i64 10
|
|
; AVX512-NEXT: [[TMP4:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i32 0
|
|
; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <2 x float*> [[TMP4]], <2 x float*> poison, <2 x i32> zeroinitializer
|
|
; AVX512-NEXT: [[TMP6:%.*]] = getelementptr float, <2 x float*> [[TMP5]], <2 x i64> <i64 3, i64 14>
|
|
; AVX512-NEXT: [[TMP7:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i32 0
|
|
; AVX512-NEXT: [[TMP8:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <4 x i32> zeroinitializer
|
|
; AVX512-NEXT: [[TMP9:%.*]] = getelementptr float, <4 x float*> [[TMP8]], <4 x i64> <i64 17, i64 8, i64 5, i64 20>
|
|
; AVX512-NEXT: [[TMP10:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i32 0
|
|
; AVX512-NEXT: [[TMP11:%.*]] = insertelement <8 x float*> [[TMP10]], float* [[TMP3]], i32 1
|
|
; AVX512-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP6]], <2 x float*> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
|
|
; AVX512-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
|
|
; AVX512-NEXT: [[TMP14:%.*]] = shufflevector <4 x float*> [[TMP9]], <4 x float*> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
|
|
; AVX512-NEXT: [[TMP15:%.*]] = shufflevector <8 x float*> [[TMP13]], <8 x float*> [[TMP14]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
|
|
; AVX512-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
|
|
; AVX512-NEXT: [[TMP17:%.*]] = shufflevector <8 x float*> [[TMP10]], <8 x float*> poison, <8 x i32> zeroinitializer
|
|
; AVX512-NEXT: [[TMP18:%.*]] = getelementptr float, <8 x float*> [[TMP17]], <8 x i64> <i64 4, i64 13, i64 11, i64 44, i64 33, i64 30, i64 27, i64 23>
|
|
; AVX512-NEXT: [[TMP19:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP18]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef), !tbaa [[TBAA0]]
|
|
; AVX512-NEXT: [[TMP20:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP19]]
|
|
; AVX512-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>*
|
|
; AVX512-NEXT: store <8 x float> [[TMP20]], <8 x float>* [[TMP21]], align 4, !tbaa [[TBAA0]]
|
|
; AVX512-NEXT: ret void
|
|
%3 = load float, float* %1, align 4, !tbaa !2
|
|
%4 = getelementptr inbounds float, float* %1, i64 4
|
|
%5 = load float, float* %4, align 4, !tbaa !2
|
|
%6 = fdiv float %3, %5
|
|
%7 = getelementptr inbounds float, float* %0, i64 1
|
|
store float %6, float* %0, align 4, !tbaa !2
|
|
%8 = getelementptr inbounds float, float* %1, i64 10
|
|
%9 = load float, float* %8, align 4, !tbaa !2
|
|
%10 = getelementptr inbounds float, float* %1, i64 13
|
|
%11 = load float, float* %10, align 4, !tbaa !2
|
|
%12 = fdiv float %9, %11
|
|
%13 = getelementptr inbounds float, float* %0, i64 2
|
|
store float %12, float* %7, align 4, !tbaa !2
|
|
%14 = getelementptr inbounds float, float* %1, i64 3
|
|
%15 = load float, float* %14, align 4, !tbaa !2
|
|
%16 = getelementptr inbounds float, float* %1, i64 11
|
|
%17 = load float, float* %16, align 4, !tbaa !2
|
|
%18 = fdiv float %15, %17
|
|
%19 = getelementptr inbounds float, float* %0, i64 3
|
|
store float %18, float* %13, align 4, !tbaa !2
|
|
%20 = getelementptr inbounds float, float* %1, i64 14
|
|
%21 = load float, float* %20, align 4, !tbaa !2
|
|
%22 = getelementptr inbounds float, float* %1, i64 44
|
|
%23 = load float, float* %22, align 4, !tbaa !2
|
|
%24 = fdiv float %21, %23
|
|
%25 = getelementptr inbounds float, float* %0, i64 4
|
|
store float %24, float* %19, align 4, !tbaa !2
|
|
%26 = getelementptr inbounds float, float* %1, i64 17
|
|
%27 = load float, float* %26, align 4, !tbaa !2
|
|
%28 = getelementptr inbounds float, float* %1, i64 33
|
|
%29 = load float, float* %28, align 4, !tbaa !2
|
|
%30 = fdiv float %27, %29
|
|
%31 = getelementptr inbounds float, float* %0, i64 5
|
|
store float %30, float* %25, align 4, !tbaa !2
|
|
%32 = getelementptr inbounds float, float* %1, i64 8
|
|
%33 = load float, float* %32, align 4, !tbaa !2
|
|
%34 = getelementptr inbounds float, float* %1, i64 30
|
|
%35 = load float, float* %34, align 4, !tbaa !2
|
|
%36 = fdiv float %33, %35
|
|
%37 = getelementptr inbounds float, float* %0, i64 6
|
|
store float %36, float* %31, align 4, !tbaa !2
|
|
%38 = getelementptr inbounds float, float* %1, i64 5
|
|
%39 = load float, float* %38, align 4, !tbaa !2
|
|
%40 = getelementptr inbounds float, float* %1, i64 27
|
|
%41 = load float, float* %40, align 4, !tbaa !2
|
|
%42 = fdiv float %39, %41
|
|
%43 = getelementptr inbounds float, float* %0, i64 7
|
|
store float %42, float* %37, align 4, !tbaa !2
|
|
%44 = getelementptr inbounds float, float* %1, i64 20
|
|
%45 = load float, float* %44, align 4, !tbaa !2
|
|
%46 = getelementptr inbounds float, float* %1, i64 23
|
|
%47 = load float, float* %46, align 4, !tbaa !2
|
|
%48 = fdiv float %45, %47
|
|
store float %48, float* %43, align 4, !tbaa !2
|
|
ret void
|
|
}
|
|
|
|
; TBAA metadata referenced by the `!tbaa !2` attachments on the scalar loads and stores above.
; !2 is the access tag; its base type and access type are both !3, at offset 0.
!2 = !{!3, !3, i64 0}
|
|
; Scalar type node labelled "short", parented to !4 at offset 0.
; NOTE(review): every visible access tagged with !2 is a float load/store, so the "short"
; label does not match the accessed type. It is only a type-node name here; confirm it is
; intentional before renaming it.
!3 = !{!"short", !4, i64 0}
|
|
; Type node labelled "omnipotent char", parented to the root !5 at offset 0.
!4 = !{!"omnipotent char", !5, i64 0}
|
|
; Root node of this TBAA type tree (it carries a name only, no parent).
!5 = !{!"Simple C++ TBAA"}
|