If we vectorize, e.g., a store, we leave behind a bunch of getelementptrs for the individual scalar stores that we removed. We can go ahead and delete them as well. This is purely for test output quality and readability. It should have no effect in any sane pipeline. Differential Revision: https://reviews.llvm.org/D122493
123 lines
4.3 KiB
LLVM
123 lines
4.3 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
|
; RUN: opt < %s -basic-aa -slp-vectorizer -S -slp-schedule-budget=16 -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
|
|
|
|
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
|
|
target triple = "x86_64-apple-macosx10.9.0"
|
|
|
|
; Test if the budget for the scheduling region size works.
|
|
; We test with a reduced budget of 16, which should prevent vectorizing the loads.
|
|
|
|
; External function declared without a body. @test calls it repeatedly between
; its first group of loads and the stores that consume them.
declare void @unknown()
|
|
|
|
; @test copies four consecutive floats from %a to %b and four consecutive
; floats from %c to %d. The %a loads and the %b stores are separated by 28
; calls to @unknown; the %c loads and %d stores are adjacent.
define void @test(float * %a, float * %b, float * %c, float * %d) {
|
|
; CHECK-LABEL: @test(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A:%.*]] to <4 x float>*
|
|
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: call void @unknown()
|
|
; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[B:%.*]] to <4 x float>*
|
|
; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4
|
|
; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[C:%.*]] to <4 x float>*
|
|
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
|
|
; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[D:%.*]] to <4 x float>*
|
|
; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
entry:
|
|
; Don't vectorize these loads.
; NOTE(review): the autogenerated assertions above expect a single <4 x float>
; load from %a, which appears to contradict this comment. Confirm whether the
; comment is stale.
|
|
%l0 = load float, float* %a
|
|
%a1 = getelementptr inbounds float, float* %a, i64 1
|
|
%l1 = load float, float* %a1
|
|
%a2 = getelementptr inbounds float, float* %a, i64 2
|
|
%l2 = load float, float* %a2
|
|
%a3 = getelementptr inbounds float, float* %a, i64 3
|
|
%l3 = load float, float* %a3
|
|
|
|
; Some unrelated instructions in between to enlarge the scheduling region.
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
call void @unknown()
|
|
|
|
; Don't vectorize these stores because their operands are too far away.
; NOTE(review): the autogenerated assertions above expect a single <4 x float>
; store to %b, which appears to contradict this comment. Confirm whether the
; comment is stale.
|
|
store float %l0, float* %b
|
|
%b1 = getelementptr inbounds float, float* %b, i64 1
|
|
store float %l1, float* %b1
|
|
%b2 = getelementptr inbounds float, float* %b, i64 2
|
|
store float %l2, float* %b2
|
|
%b3 = getelementptr inbounds float, float* %b, i64 3
|
|
store float %l3, float* %b3
|
|
|
|
; But still vectorize the following instructions, because even if the budget
|
|
; is exceeded there is a minimum region size.
|
|
%l4 = load float, float* %c
|
|
%c1 = getelementptr inbounds float, float* %c, i64 1
|
|
%l5 = load float, float* %c1
|
|
%c2 = getelementptr inbounds float, float* %c, i64 2
|
|
%l6 = load float, float* %c2
|
|
%c3 = getelementptr inbounds float, float* %c, i64 3
|
|
%l7 = load float, float* %c3
|
|
|
|
store float %l4, float* %d
|
|
%d1 = getelementptr inbounds float, float* %d, i64 1
|
|
store float %l5, float* %d1
|
|
%d2 = getelementptr inbounds float, float* %d, i64 2
|
|
store float %l6, float* %d2
|
|
%d3 = getelementptr inbounds float, float* %d, i64 3
|
|
store float %l7, float* %d3
|
|
|
|
ret void
|
|
}
|
|
|