This enables vectorization of epilogue loops produced by LoopVectorizer on SystemZ. LoopVectorizationCostModel::isEpilogueVectorizationProfitable() and TTI.preferEpilogueVectorization() have been refactored slightly so that targets can override preferEpilogueVectorization(ElementCount Iters) and directly control this, whereas before this depended on TTI.getMaxInterleaveFactor() as well. The Iters passed to preferEpilogueVectorization() reflects the total number of scalar iterations performed in the vectorized loop (including interleaving). The default implementation of preferEpilogueVectorization() now subsumes the old check against getMaxInterleaveFactor(). This patch should be NFC for other targets.
85 lines
4.9 KiB
LLVM
85 lines
4.9 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
|
|
; RUN: opt -S -mtriple=s390x-unknown-linux -mcpu=z16 -passes=loop-vectorize < %s \
|
|
; RUN: | FileCheck %s
|
|
;
|
|
; Test that loop vectorizer generates a vectorized epilogue loop after
|
|
; vectorizing the main loop with VF = 16.
|
|
|
|
define void @fun(ptr noalias %Src, ptr noalias %Dst, i64 %N) {
|
|
; CHECK-LABEL: define void @fun(
|
|
; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
|
|
; CHECK-NEXT: [[ITER_CHECK:.*]]:
|
|
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
|
|
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
|
|
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
|
|
; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
|
|
; CHECK-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[TMP0]], 16
|
|
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK3]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
|
|
; CHECK: [[VECTOR_PH]]:
|
|
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
|
|
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
|
|
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
|
|
; CHECK: [[VECTOR_BODY]]:
|
|
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDEX]]
|
|
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
|
|
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]
|
|
; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], ptr [[TMP3]], align 1
|
|
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
|
|
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
|
|
; CHECK: [[MIDDLE_BLOCK]]:
|
|
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
|
|
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
|
|
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
|
|
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
|
|
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
|
|
; CHECK: [[VEC_EPILOG_PH]]:
|
|
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
|
|
; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], 4
|
|
; CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF4]]
|
|
; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
|
|
; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
|
|
; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDEX6]]
|
|
; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP5]], align 1
|
|
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX6]]
|
|
; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD7]], ptr [[TMP6]], align 1
|
|
; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX6]], 4
|
|
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC5]]
|
|
; CHECK-NEXT: br i1 [[TMP7]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
|
|
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
|
|
; CHECK-NEXT: [[CMP_N9:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]]
|
|
; CHECK-NEXT: br i1 [[CMP_N9]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
|
|
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
|
|
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
|
|
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
|
|
; CHECK: [[FOR_BODY]]:
|
|
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
|
|
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
|
|
; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[INDVARS_IV]]
|
|
; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX0]], align 1
|
|
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDVARS_IV]]
|
|
; CHECK-NEXT: store i8 [[TMP8]], ptr [[ARRAYIDX1]], align 1
|
|
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[N]]
|
|
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
|
|
; CHECK: [[EXIT]]:
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
entry:
|
|
br label %for.body
|
|
|
|
for.body:
|
|
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
|
|
%iv.next = add i64 %iv, 1
|
|
%arrayidx0 = getelementptr i8, ptr %Src, i64 %iv
|
|
%0 = load i8, ptr %arrayidx0
|
|
%arrayidx1 = getelementptr i8, ptr %Dst, i64 %iv
|
|
store i8 %0, ptr %arrayidx1
|
|
%exitcond.not = icmp eq i64 %iv, %N
|
|
br i1 %exitcond.not, label %exit, label %for.body
|
|
|
|
exit:
|
|
ret void
|
|
}
|