Philip Reames 7d6e8f2a96 [slp] Delete dead scalar instructions feeding vectorized instructions
If we vectorize, e.g., a store, we leave around a bunch of getelementptrs for the individual scalar stores which we removed. We can go ahead and delete them as well.

This is purely for test output quality and readability. It should have no effect in any sane pipeline.

Differential Revision: https://reviews.llvm.org/D122493
2022-03-28 20:10:13 -07:00

123 lines
7.3 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -slp-vectorizer -S -mtriple=aarch64-unknown-unknown -mcpu=cortex-a53 | FileCheck %s
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
; This test is reduced from the TSVC evaluation of vectorizers:
; https://github.com/llvm/llvm-test-suite/commits/main/MultiSource/Benchmarks/TSVC/LoopRerolling-flt/tsc.c
; Two loads and an fmul are expected to be vectorized to <2 x float>.
; Otherwise, performance will suffer on Cortex-A53.
; See https://bugs.llvm.org/show_bug.cgi?id=36280 for more details.
; Aggregate backing the test data. Field indices, as used by the getelementptr
; instructions in this file:
;   0, 3, 6, 9, 12  -> [32000 x float] arrays
;   15, 18, 21, 24  -> [256 x [256 x float]] matrices
; The i32 and i8 array fields sit between them (presumably guard/padding
; members carried over from the original benchmark -- not used in this test).
; @s352 only addresses fields 0 and 3.
%struct.GlobalData = type { [32000 x float], [3 x i32], [4 x i8], [32000 x float], [5 x i32], [12 x i8], [32000 x float], [7 x i32], [4 x i8], [32000 x float], [11 x i32], [4 x i8], [32000 x float], [13 x i32], [12 x i8], [256 x [256 x float]], [17 x i32], [12 x i8], [256 x [256 x float]], [19 x i32], [4 x i8], [256 x [256 x float]], [23 x i32], [4 x i8], [256 x [256 x float]] }
; Single zero-initialized instance of the aggregate, 16-byte aligned.
@global_data = common dso_local global %struct.GlobalData zeroinitializer, align 16
; s352: dot product of fields 0 and 3 of @global_data, unrolled five times.
; The inner loop (for.body) steps the element index by 5 while the next index
; is below 32000; the outer loop (preheader / for.cond.cleanup3) repeats the
; inner loop until its counter reaches 1600000. The function always returns 0;
; the accumulated sum only feeds the loop-carried phi.
;
; The autogenerated expectations below require elements +0/+1 and +2/+3 to be
; turned into <2 x float> loads and fmuls, with each lane extracted back into
; the scalar fadd chain, while element +4 stays scalar. The address
; computations for the odd elements (+1, +3) must not appear in the output.
define i32 @s352() {
; CHECK-LABEL: @s352(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[PREHEADER:%.*]]
; CHECK: preheader:
; CHECK-NEXT: [[NL_017:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_COND_CLEANUP3:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret i32 0
; CHECK: for.cond.cleanup3:
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[NL_017]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1600000
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[PREHEADER]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[DOT_115:%.*]] = phi float [ 0.000000e+00, [[PREHEADER]] ], [ [[ADD39:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA:%.*]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[ARRAYIDX]] to <2 x float>*
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX6]] to <2 x float>*
; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[TMP2]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x float> [[TMP1]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
; CHECK-NEXT: [[ADD:%.*]] = fadd float [[DOT_115]], [[TMP5]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
; CHECK-NEXT: [[ADD15:%.*]] = fadd float [[ADD]], [[TMP6]]
; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP7]]
; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP7]]
; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[ARRAYIDX18]] to <2 x float>*
; CHECK-NEXT: [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[TMP8]], align 4
; CHECK-NEXT: [[TMP10:%.*]] = bitcast float* [[ARRAYIDX21]] to <2 x float>*
; CHECK-NEXT: [[TMP11:%.*]] = load <2 x float>, <2 x float>* [[TMP10]], align 4
; CHECK-NEXT: [[TMP12:%.*]] = fmul <2 x float> [[TMP9]], [[TMP11]]
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP12]], i32 0
; CHECK-NEXT: [[ADD23:%.*]] = fadd float [[ADD15]], [[TMP13]]
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP12]], i32 1
; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD23]], [[TMP14]]
; CHECK-NEXT: [[TMP15:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP15]]
; CHECK-NEXT: [[TMP16:%.*]] = load float, float* [[ARRAYIDX34]], align 4
; CHECK-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[TMP15]]
; CHECK-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX37]], align 4
; CHECK-NEXT: [[MUL38:%.*]] = fmul float [[TMP16]], [[TMP17]]
; CHECK-NEXT: [[ADD39]] = fadd float [[ADD31]], [[MUL38]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 32000
; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP3]]
;
entry:
br label %preheader
; Outer loop header: %nl.017 counts completed passes over the arrays.
preheader:
%nl.017 = phi i32 [ 0, %entry ], [ %inc, %for.cond.cleanup3 ]
br label %for.body
; Function exit: the result is the constant 0, not the accumulated sum.
for.cond.cleanup:
ret i32 0
; Outer loop latch: leave once the pass counter reaches 1600000.
for.cond.cleanup3:
%inc = add nuw nsw i32 %nl.017, 1
%exitcond = icmp eq i32 %inc, 1600000
br i1 %exitcond, label %for.cond.cleanup, label %preheader
; Inner loop: %indvars.iv is the base element index (step 5) and %dot.115 the
; running sum, reset to 0.0 on every entry from the preheader.
for.body:
%indvars.iv = phi i64 [ 0, %preheader ], [ %indvars.iv.next, %for.body ]
%dot.115 = phi float [ 0.000000e+00, %preheader ], [ %add39, %for.body ]
; Element +0: field0[iv] * field3[iv], added to the running sum.
%arrayidx = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 0, i64 %indvars.iv
%0 = load float, float* %arrayidx, align 4
%arrayidx6 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 3, i64 %indvars.iv
%1 = load float, float* %arrayidx6, align 4
%mul7 = fmul float %0, %1
%add = fadd float %dot.115, %mul7
; Element +1: adjacent to element +0, so the two form the first vector pair.
%2 = add nuw nsw i64 %indvars.iv, 1
%arrayidx10 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 0, i64 %2
%3 = load float, float* %arrayidx10, align 4
%arrayidx13 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 3, i64 %2
%4 = load float, float* %arrayidx13, align 4
%mul14 = fmul float %3, %4
%add15 = fadd float %add, %mul14
; Element +2: first lane of the second vector pair.
%5 = add nuw nsw i64 %indvars.iv, 2
%arrayidx18 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 0, i64 %5
%6 = load float, float* %arrayidx18, align 4
%arrayidx21 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 3, i64 %5
%7 = load float, float* %arrayidx21, align 4
%mul22 = fmul float %6, %7
%add23 = fadd float %add15, %mul22
; Element +3: second lane of the second vector pair.
%8 = add nuw nsw i64 %indvars.iv, 3
%arrayidx26 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 0, i64 %8
%9 = load float, float* %arrayidx26, align 4
%arrayidx29 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 3, i64 %8
%10 = load float, float* %arrayidx29, align 4
%mul30 = fmul float %9, %10
%add31 = fadd float %add23, %mul30
; Element +4: has no partner within this iteration; expected to stay scalar.
%11 = add nuw nsw i64 %indvars.iv, 4
%arrayidx34 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 0, i64 %11
%12 = load float, float* %arrayidx34, align 4
%arrayidx37 = getelementptr inbounds %struct.GlobalData, %struct.GlobalData* @global_data, i64 0, i32 3, i64 %11
%13 = load float, float* %arrayidx37, align 4
%mul38 = fmul float %12, %13
%add39 = fadd float %add31, %mul38
; Advance by the unroll factor and loop while the next base index is < 32000.
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 5
%cmp2 = icmp ult i64 %indvars.iv.next, 32000
br i1 %cmp2, label %for.body, label %for.cond.cleanup3
}