David Green b55d4c2ad8 Revert "[LV] Remove LoopVectorizationCostModel::useEmulatedMaskMemRefHack()"
This reverts commit 77a0da926c9ea86afa9baf28158d79c7678fc6b9 as we've
received multiple reports of this significantly impacting performance,
in ways that don't seem to just be target specific cost models going
wrong. I would offer some reproducers, but the test changes here seem to
be full of them!

Reverting for now and hopefully we can remove the "hack" more carefully
as we go.
2022-02-09 20:02:54 +00:00

109 lines
5.3 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -vectorize-num-stores-pred=2 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; Vectorization with dependence checks.
; Check that a non-power-of-2 MaxVF, calculated based on maximum safe distance,
; does not lead fold-tail to think that no tail will be generated for any chosen
; (power of 2) VF.
; Dependence distance here is 3 iterations.
; Tiny trip count of 15 divides 3, but any (even) VF will have a tail.
;unsigned char a [15+3];
;void maxvf3(){
; for (int j = 0; j < 15; ++j) {
; a[j] = 69;
; a[j+3] = 7;
; }
;}
@a = common local_unnamed_addr global [18 x i8] zeroinitializer, align 16
define void @maxvf3() {
; CHECK-LABEL: @maxvf3(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <2 x i32> [[VEC_IND]], <i32 14, i32 14>
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; CHECK: pred.store.if:
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [18 x i8], [18 x i8]* @a, i32 0, i32 [[TMP2]]
; CHECK-NEXT: store i8 69, i8* [[TMP3]], align 8
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
; CHECK: pred.store.continue:
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
; CHECK: pred.store.if1:
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [18 x i8], [18 x i8]* @a, i32 0, i32 [[TMP5]]
; CHECK-NEXT: store i8 69, i8* [[TMP6]], align 8
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]]
; CHECK: pred.store.continue2:
; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw <2 x i32> <i32 3, i32 3>, [[VEC_IND]]
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
; CHECK: pred.store.if3:
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i32 0
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [18 x i8], [18 x i8]* @a, i32 0, i32 [[TMP9]]
; CHECK-NEXT: store i8 7, i8* [[TMP10]], align 8
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
; CHECK: pred.store.continue4:
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
; CHECK: pred.store.if5:
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [18 x i8], [18 x i8]* @a, i32 0, i32 [[TMP12]]
; CHECK-NEXT: store i8 7, i8* [[TMP13]], align 8
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
; CHECK: pred.store.continue6:
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[J:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[J_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[AJ:%.*]] = getelementptr inbounds [18 x i8], [18 x i8]* @a, i32 0, i32 [[J]]
; CHECK-NEXT: store i8 69, i8* [[AJ]], align 8
; CHECK-NEXT: [[JP3:%.*]] = add nuw nsw i32 3, [[J]]
; CHECK-NEXT: [[AJP3:%.*]] = getelementptr inbounds [18 x i8], [18 x i8]* @a, i32 0, i32 [[JP3]]
; CHECK-NEXT: store i8 7, i8* [[AJP3]], align 8
; CHECK-NEXT: [[J_NEXT]] = add nuw nsw i32 [[J]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[J_NEXT]], 15
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !2
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
for.body:
%j = phi i32 [ 0, %entry ], [ %j.next, %for.body ]
%aj = getelementptr inbounds [18 x i8], [18 x i8]* @a, i32 0, i32 %j
store i8 69, i8* %aj, align 8
%jp3 = add nuw nsw i32 3, %j
%ajp3 = getelementptr inbounds [18 x i8], [18 x i8]* @a, i32 0, i32 %jp3
store i8 7, i8* %ajp3, align 8
%j.next = add nuw nsw i32 %j, 1
%exitcond = icmp eq i32 %j.next, 15
br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
for.end:
ret void
}
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.vectorize.enable", i1 true}