This reverts commit 77a0da926c9ea86afa9baf28158d79c7678fc6b9 as we've received multiple reports of this significantly impacting performance, in ways that don't seem to just be target specific cost models going wrong. I would offer some reproducers, but the test changes here seem to be full of them! Reverting for now and hopefully we can remove the "hack" more carefully as we go.
109 lines
5.3 KiB
LLVM
109 lines
5.3 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
|
; RUN: opt < %s -loop-vectorize -vectorize-num-stores-pred=2 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s
|
|
|
|
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
|
|
|
|
; Vectorization with dependence checks.
|
|
|
|
; Check that a non-power-of-2 MaxVF, calculated based on maximum safe distance,
|
|
; does not lead fold-tail to think that no tail will be generated for any chosen
|
|
; (power of 2) VF.
|
|
; Dependence distance here is 3 iterations.
|
|
; Tiny trip count of 15 divides 3, but any (even) VF will have a tail.
|
|
|
|
;unsigned char a [15+3];
|
|
;void maxvf3(){
|
|
; for (int j = 0; j < 15; ++j) {
|
|
; a[j] = 69;
|
|
; a[j+3] = 7;
|
|
; }
|
|
;}
|
|
|
|
@a = common local_unnamed_addr global [18 x i8] zeroinitializer, align 16
|
|
|
|
define void @maxvf3() {
|
|
; CHECK-LABEL: @maxvf3(
|
|
; CHECK-NEXT: entry:
|
|
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
|
|
; CHECK: vector.ph:
|
|
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
|
; CHECK: vector.body:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
|
|
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
|
|
; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <2 x i32> [[VEC_IND]], <i32 14, i32 14>
|
|
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
|
|
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
|
|
; CHECK: pred.store.if:
|
|
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 0
|
|
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [18 x i8], [18 x i8]* @a, i32 0, i32 [[TMP2]]
|
|
; CHECK-NEXT: store i8 69, i8* [[TMP3]], align 8
|
|
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
|
|
; CHECK: pred.store.continue:
|
|
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
|
|
; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
|
|
; CHECK: pred.store.if1:
|
|
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1
|
|
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [18 x i8], [18 x i8]* @a, i32 0, i32 [[TMP5]]
|
|
; CHECK-NEXT: store i8 69, i8* [[TMP6]], align 8
|
|
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]]
|
|
; CHECK: pred.store.continue2:
|
|
; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw <2 x i32> <i32 3, i32 3>, [[VEC_IND]]
|
|
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
|
|
; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
|
|
; CHECK: pred.store.if3:
|
|
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i32 0
|
|
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [18 x i8], [18 x i8]* @a, i32 0, i32 [[TMP9]]
|
|
; CHECK-NEXT: store i8 7, i8* [[TMP10]], align 8
|
|
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
|
|
; CHECK: pred.store.continue4:
|
|
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
|
|
; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
|
|
; CHECK: pred.store.if5:
|
|
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1
|
|
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [18 x i8], [18 x i8]* @a, i32 0, i32 [[TMP12]]
|
|
; CHECK-NEXT: store i8 7, i8* [[TMP13]], align 8
|
|
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
|
|
; CHECK: pred.store.continue6:
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
|
|
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
|
|
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
|
|
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
|
|
; CHECK: middle.block:
|
|
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
|
|
; CHECK: scalar.ph:
|
|
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
|
|
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
|
|
; CHECK: for.body:
|
|
; CHECK-NEXT: [[J:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[J_NEXT:%.*]], [[FOR_BODY]] ]
|
|
; CHECK-NEXT: [[AJ:%.*]] = getelementptr inbounds [18 x i8], [18 x i8]* @a, i32 0, i32 [[J]]
|
|
; CHECK-NEXT: store i8 69, i8* [[AJ]], align 8
|
|
; CHECK-NEXT: [[JP3:%.*]] = add nuw nsw i32 3, [[J]]
|
|
; CHECK-NEXT: [[AJP3:%.*]] = getelementptr inbounds [18 x i8], [18 x i8]* @a, i32 0, i32 [[JP3]]
|
|
; CHECK-NEXT: store i8 7, i8* [[AJP3]], align 8
|
|
; CHECK-NEXT: [[J_NEXT]] = add nuw nsw i32 [[J]], 1
|
|
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[J_NEXT]], 15
|
|
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !2
|
|
; CHECK: for.end:
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
entry:
|
|
br label %for.body
|
|
|
|
for.body:
|
|
%j = phi i32 [ 0, %entry ], [ %j.next, %for.body ]
|
|
%aj = getelementptr inbounds [18 x i8], [18 x i8]* @a, i32 0, i32 %j
|
|
store i8 69, i8* %aj, align 8
|
|
%jp3 = add nuw nsw i32 3, %j
|
|
%ajp3 = getelementptr inbounds [18 x i8], [18 x i8]* @a, i32 0, i32 %jp3
|
|
store i8 7, i8* %ajp3, align 8
|
|
%j.next = add nuw nsw i32 %j, 1
|
|
%exitcond = icmp eq i32 %j.next, 15
|
|
br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
|
|
|
|
for.end:
|
|
ret void
|
|
}
|
|
|
|
!0 = distinct !{!0, !1}
|
|
!1 = !{!"llvm.loop.vectorize.enable", i1 true}
|