John Brawn 9a9b8b7d1c
[AArch64] Allow unrolling of scalar epilogue loops (#151164)
#147420 changed the unrolling preferences to permit unrolling of
non-auto vectorized loops by checking for the isvectorized attribute,
however when a loop is vectorized this attribute is put on both the
vector loop and the scalar epilogue, so this change prevented the scalar
epilogue from being unrolled.

Restore the previous behaviour of unrolling the scalar epilogue by
checking both for the isvectorized attribute and vector instructions in
the loop.
2025-07-31 11:03:41 +01:00

691 lines
48 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
; RUN: opt -p loop-unroll -mtriple=aarch64-unknown-linux -mcpu=cortex-a55 -S %s | FileCheck %s -check-prefix=CORTEXA55
define void @reverse(ptr %dst, ptr %src, i64 %len) {
; APPLE-LABEL: define void @reverse(
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
; APPLE-NEXT: [[ENTRY:.*]]:
; APPLE-NEXT: [[TMP0:%.*]] = add i64 [[LEN]], -1
; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[LEN]], 7
; APPLE-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7
; APPLE-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
; APPLE: [[ENTRY_NEW]]:
; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
; APPLE-NEXT: br label %[[FOR_BODY:.*]]
; APPLE: [[FOR_BODY]]:
; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
; APPLE-NEXT: [[TMP2:%.*]] = sub nsw i64 [[LEN]], [[IV]]
; APPLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP2]]
; APPLE-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
; APPLE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
; APPLE-NEXT: store <4 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 16
; APPLE-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
; APPLE-NEXT: [[TMP4:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT]]
; APPLE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP4]]
; APPLE-NEXT: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
; APPLE-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT]]
; APPLE-NEXT: store <4 x float> [[TMP5]], ptr [[ARRAYIDX2_1]], align 16
; APPLE-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
; APPLE-NEXT: [[TMP6:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_1]]
; APPLE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP6]]
; APPLE-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
; APPLE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_1]]
; APPLE-NEXT: store <4 x float> [[TMP7]], ptr [[ARRAYIDX2_2]], align 16
; APPLE-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
; APPLE-NEXT: [[TMP8:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_2]]
; APPLE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP8]]
; APPLE-NEXT: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
; APPLE-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_2]]
; APPLE-NEXT: store <4 x float> [[TMP9]], ptr [[ARRAYIDX2_3]], align 16
; APPLE-NEXT: [[IV_NEXT_3:%.*]] = add nuw nsw i64 [[IV]], 4
; APPLE-NEXT: [[TMP10:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_3]]
; APPLE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP10]]
; APPLE-NEXT: [[TMP11:%.*]] = load <4 x float>, ptr [[ARRAYIDX_4]], align 16
; APPLE-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_3]]
; APPLE-NEXT: store <4 x float> [[TMP11]], ptr [[ARRAYIDX2_4]], align 16
; APPLE-NEXT: [[IV_NEXT_4:%.*]] = add nuw nsw i64 [[IV]], 5
; APPLE-NEXT: [[TMP12:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_4]]
; APPLE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP12]]
; APPLE-NEXT: [[TMP13:%.*]] = load <4 x float>, ptr [[ARRAYIDX_5]], align 16
; APPLE-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_4]]
; APPLE-NEXT: store <4 x float> [[TMP13]], ptr [[ARRAYIDX2_5]], align 16
; APPLE-NEXT: [[IV_NEXT_5:%.*]] = add nuw nsw i64 [[IV]], 6
; APPLE-NEXT: [[TMP14:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_5]]
; APPLE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP14]]
; APPLE-NEXT: [[TMP15:%.*]] = load <4 x float>, ptr [[ARRAYIDX_6]], align 16
; APPLE-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_5]]
; APPLE-NEXT: store <4 x float> [[TMP15]], ptr [[ARRAYIDX2_6]], align 16
; APPLE-NEXT: [[IV_NEXT_6:%.*]] = add nuw nsw i64 [[IV]], 7
; APPLE-NEXT: [[TMP16:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_6]]
; APPLE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP16]]
; APPLE-NEXT: [[TMP17:%.*]] = load <4 x float>, ptr [[ARRAYIDX_7]], align 16
; APPLE-NEXT: [[ARRAYIDX2_7:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_6]]
; APPLE-NEXT: store <4 x float> [[TMP17]], ptr [[ARRAYIDX2_7]], align 16
; APPLE-NEXT: [[IV_NEXT_7]] = add nuw nsw i64 [[IV]], 8
; APPLE-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8
; APPLE-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
; APPLE-NEXT: br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[FOR_BODY]]
; APPLE: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
; APPLE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_7]], %[[FOR_BODY]] ]
; APPLE-NEXT: br label %[[EXIT_UNR_LCSSA]]
; APPLE: [[EXIT_UNR_LCSSA]]:
; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
; APPLE: [[FOR_BODY_EPIL_PREHEADER]]:
; APPLE-NEXT: br label %[[FOR_BODY_EPIL:.*]]
; APPLE: [[FOR_BODY_EPIL]]:
; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ [[IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
; APPLE-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
; APPLE-NEXT: [[TMP18:%.*]] = sub nsw i64 [[LEN]], [[IV_EPIL]]
; APPLE-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP18]]
; APPLE-NEXT: [[TMP19:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
; APPLE-NEXT: [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_EPIL]]
; APPLE-NEXT: store <4 x float> [[TMP19]], ptr [[ARRAYIDX2_EPIL]], align 16
; APPLE-NEXT: [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
; APPLE-NEXT: [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[LEN]]
; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
; APPLE: [[EXIT_EPILOG_LCSSA]]:
; APPLE-NEXT: br label %[[EXIT]]
; APPLE: [[EXIT]]:
; APPLE-NEXT: ret void
;
; CORTEXA55-LABEL: define void @reverse(
; CORTEXA55-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
; CORTEXA55-NEXT: [[ENTRY:.*]]:
; CORTEXA55-NEXT: [[TMP0:%.*]] = add i64 [[LEN]], -1
; CORTEXA55-NEXT: [[XTRAITER:%.*]] = and i64 [[LEN]], 3
; CORTEXA55-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 3
; CORTEXA55-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
; CORTEXA55: [[ENTRY_NEW]]:
; CORTEXA55-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
; CORTEXA55-NEXT: br label %[[FOR_BODY:.*]]
; CORTEXA55: [[FOR_BODY]]:
; CORTEXA55-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_3:%.*]], %[[FOR_BODY]] ]
; CORTEXA55-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_3:%.*]], %[[FOR_BODY]] ]
; CORTEXA55-NEXT: [[TMP2:%.*]] = sub nsw i64 [[LEN]], [[IV]]
; CORTEXA55-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP2]]
; CORTEXA55-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
; CORTEXA55-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
; CORTEXA55-NEXT: store <4 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 16
; CORTEXA55-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
; CORTEXA55-NEXT: [[TMP4:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT]]
; CORTEXA55-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP4]]
; CORTEXA55-NEXT: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
; CORTEXA55-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT]]
; CORTEXA55-NEXT: store <4 x float> [[TMP5]], ptr [[ARRAYIDX2_1]], align 16
; CORTEXA55-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
; CORTEXA55-NEXT: [[TMP6:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_1]]
; CORTEXA55-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP6]]
; CORTEXA55-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
; CORTEXA55-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_1]]
; CORTEXA55-NEXT: store <4 x float> [[TMP7]], ptr [[ARRAYIDX2_2]], align 16
; CORTEXA55-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
; CORTEXA55-NEXT: [[TMP8:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_2]]
; CORTEXA55-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP8]]
; CORTEXA55-NEXT: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
; CORTEXA55-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_2]]
; CORTEXA55-NEXT: store <4 x float> [[TMP9]], ptr [[ARRAYIDX2_3]], align 16
; CORTEXA55-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
; CORTEXA55-NEXT: [[NITER_NEXT_3]] = add i64 [[NITER]], 4
; CORTEXA55-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NEXT_3]], [[UNROLL_ITER]]
; CORTEXA55-NEXT: br i1 [[NITER_NCMP_3]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[FOR_BODY]]
; CORTEXA55: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
; CORTEXA55-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_3]], %[[FOR_BODY]] ]
; CORTEXA55-NEXT: br label %[[EXIT_UNR_LCSSA]]
; CORTEXA55: [[EXIT_UNR_LCSSA]]:
; CORTEXA55-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
; CORTEXA55-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
; CORTEXA55-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
; CORTEXA55: [[FOR_BODY_EPIL_PREHEADER]]:
; CORTEXA55-NEXT: br label %[[FOR_BODY_EPIL:.*]]
; CORTEXA55: [[FOR_BODY_EPIL]]:
; CORTEXA55-NEXT: [[TMP10:%.*]] = sub nsw i64 [[LEN]], [[IV_UNR]]
; CORTEXA55-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP10]]
; CORTEXA55-NEXT: [[TMP11:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
; CORTEXA55-NEXT: [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_UNR]]
; CORTEXA55-NEXT: store <4 x float> [[TMP11]], ptr [[ARRAYIDX2_EPIL]], align 16
; CORTEXA55-NEXT: [[IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[IV_UNR]], 1
; CORTEXA55-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 1, [[XTRAITER]]
; CORTEXA55-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL_1:.*]], label %[[EXIT_EPILOG_LCSSA:.*]]
; CORTEXA55: [[FOR_BODY_EPIL_1]]:
; CORTEXA55-NEXT: [[TMP12:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL]]
; CORTEXA55-NEXT: [[ARRAYIDX_EPIL_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP12]]
; CORTEXA55-NEXT: [[TMP13:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_1]], align 16
; CORTEXA55-NEXT: [[ARRAYIDX2_EPIL_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL]]
; CORTEXA55-NEXT: store <4 x float> [[TMP13]], ptr [[ARRAYIDX2_EPIL_1]], align 16
; CORTEXA55-NEXT: [[IV_NEXT_EPIL_1:%.*]] = add nuw nsw i64 [[IV_UNR]], 2
; CORTEXA55-NEXT: [[EPIL_ITER_CMP_1:%.*]] = icmp ne i64 2, [[XTRAITER]]
; CORTEXA55-NEXT: br i1 [[EPIL_ITER_CMP_1]], label %[[FOR_BODY_EPIL_2:.*]], label %[[EXIT_EPILOG_LCSSA]]
; CORTEXA55: [[FOR_BODY_EPIL_2]]:
; CORTEXA55-NEXT: [[TMP14:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL_1]]
; CORTEXA55-NEXT: [[ARRAYIDX_EPIL_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP14]]
; CORTEXA55-NEXT: [[TMP15:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_2]], align 16
; CORTEXA55-NEXT: [[ARRAYIDX2_EPIL_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL_1]]
; CORTEXA55-NEXT: store <4 x float> [[TMP15]], ptr [[ARRAYIDX2_EPIL_2]], align 16
; CORTEXA55-NEXT: br label %[[EXIT_EPILOG_LCSSA]]
; CORTEXA55: [[EXIT_EPILOG_LCSSA]]:
; CORTEXA55-NEXT: br label %[[EXIT]]
; CORTEXA55: [[EXIT]]:
; CORTEXA55-NEXT: ret void
;
entry: ; preds = %entry
br label %for.body
for.body: ; preds = %entry, %for.body
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%1 = sub nsw i64 %len, %iv
%arrayidx = getelementptr inbounds <4 x float>, ptr %src, i64 %1
%2 = load <4 x float>, ptr %arrayidx, align 16
%arrayidx2 = getelementptr inbounds nuw <4 x float>, ptr %dst, i64 %iv
store <4 x float> %2, ptr %arrayidx2, align 16
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, %len
br i1 %exitcond.not, label %exit, label %for.body
exit: ; preds = %for.body, %entry
ret void
}
define void @saxpy_tripcount8_full_unroll(ptr %dst, ptr %src, float %a) {
; APPLE-LABEL: define void @saxpy_tripcount8_full_unroll(
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
; APPLE-NEXT: [[ENTRY:.*:]]
; APPLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; APPLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; APPLE-NEXT: br label %[[VECTOR_BODY:.*]]
; APPLE: [[VECTOR_BODY]]:
; APPLE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[SRC]], align 4
; APPLE-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[DST]], align 4
; APPLE-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; APPLE-NEXT: store <4 x float> [[TMP0]], ptr [[DST]], align 4
; APPLE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 4
; APPLE-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; APPLE-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 4
; APPLE-NEXT: [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; APPLE-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
; APPLE-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
; APPLE-NEXT: ret void
;
; CORTEXA55-LABEL: define void @saxpy_tripcount8_full_unroll(
; CORTEXA55-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
; CORTEXA55-NEXT: [[ENTRY:.*:]]
; CORTEXA55-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; CORTEXA55-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; CORTEXA55-NEXT: br label %[[VECTOR_BODY:.*]]
; CORTEXA55: [[VECTOR_BODY]]:
; CORTEXA55-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[SRC]], align 4
; CORTEXA55-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[DST]], align 4
; CORTEXA55-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; CORTEXA55-NEXT: store <4 x float> [[TMP0]], ptr [[DST]], align 4
; CORTEXA55-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 4
; CORTEXA55-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CORTEXA55-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 4
; CORTEXA55-NEXT: [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; CORTEXA55-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
; CORTEXA55-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
; CORTEXA55-NEXT: ret void
;
entry:
%broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
%broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds nuw float, ptr %src, i64 %index
%wide.load = load <4 x float>, ptr %0, align 4
%1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
%wide.load12 = load <4 x float>, ptr %1, align 4
%2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
store <4 x float> %2, ptr %1, align 4
%index.next = add nuw i64 %index, 4
%3 = icmp eq i64 %index.next, 8
br i1 %3, label %exit, label %vector.body
exit: ; preds = %vector.body
ret void
}
define void @saxpy_tripcount1K_av0(ptr %dst, ptr %src, float %a) {
; APPLE-LABEL: define void @saxpy_tripcount1K_av0(
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
; APPLE-NEXT: [[ENTRY:.*]]:
; APPLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; APPLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; APPLE-NEXT: br label %[[VECTOR_BODY:.*]]
; APPLE: [[VECTOR_BODY]]:
; APPLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; APPLE-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
; APPLE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
; APPLE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
; APPLE-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; APPLE-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; APPLE-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
; APPLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; APPLE-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; APPLE-NEXT: br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]]
; APPLE: [[EXIT]]:
; APPLE-NEXT: ret void
;
; CORTEXA55-LABEL: define void @saxpy_tripcount1K_av0(
; CORTEXA55-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
; CORTEXA55-NEXT: [[ENTRY:.*]]:
; CORTEXA55-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; CORTEXA55-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; CORTEXA55-NEXT: br label %[[VECTOR_BODY:.*]]
; CORTEXA55: [[VECTOR_BODY]]:
; CORTEXA55-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT_15:%.*]], %[[VECTOR_BODY]] ]
; CORTEXA55-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
; CORTEXA55-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
; CORTEXA55-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
; CORTEXA55-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CORTEXA55-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; CORTEXA55-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
; CORTEXA55-NEXT: [[INDEX_NEXT:%.*]] = add nuw nsw i64 [[INDEX]], 4
; CORTEXA55-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT]]
; CORTEXA55-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
; CORTEXA55-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT]]
; CORTEXA55-NEXT: [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; CORTEXA55-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
; CORTEXA55-NEXT: store <4 x float> [[TMP5]], ptr [[TMP4]], align 4
; CORTEXA55-NEXT: [[INDEX_NEXT_1:%.*]] = add nuw nsw i64 [[INDEX]], 8
; CORTEXA55-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_1]]
; CORTEXA55-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
; CORTEXA55-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_1]]
; CORTEXA55-NEXT: [[WIDE_LOAD12_2:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
; CORTEXA55-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_2]], <4 x float> [[WIDE_LOAD12_2]])
; CORTEXA55-NEXT: store <4 x float> [[TMP8]], ptr [[TMP7]], align 4
; CORTEXA55-NEXT: [[INDEX_NEXT_2:%.*]] = add nuw nsw i64 [[INDEX]], 12
; CORTEXA55-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_2]]
; CORTEXA55-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x float>, ptr [[TMP9]], align 4
; CORTEXA55-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_2]]
; CORTEXA55-NEXT: [[WIDE_LOAD12_3:%.*]] = load <4 x float>, ptr [[TMP10]], align 4
; CORTEXA55-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_3]], <4 x float> [[WIDE_LOAD12_3]])
; CORTEXA55-NEXT: store <4 x float> [[TMP11]], ptr [[TMP10]], align 4
; CORTEXA55-NEXT: [[INDEX_NEXT_3:%.*]] = add nuw nsw i64 [[INDEX]], 16
; CORTEXA55-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_3]]
; CORTEXA55-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x float>, ptr [[TMP12]], align 4
; CORTEXA55-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_3]]
; CORTEXA55-NEXT: [[WIDE_LOAD12_4:%.*]] = load <4 x float>, ptr [[TMP13]], align 4
; CORTEXA55-NEXT: [[TMP14:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_4]], <4 x float> [[WIDE_LOAD12_4]])
; CORTEXA55-NEXT: store <4 x float> [[TMP14]], ptr [[TMP13]], align 4
; CORTEXA55-NEXT: [[INDEX_NEXT_4:%.*]] = add nuw nsw i64 [[INDEX]], 20
; CORTEXA55-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_4]]
; CORTEXA55-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x float>, ptr [[TMP15]], align 4
; CORTEXA55-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_4]]
; CORTEXA55-NEXT: [[WIDE_LOAD12_5:%.*]] = load <4 x float>, ptr [[TMP16]], align 4
; CORTEXA55-NEXT: [[TMP17:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_5]], <4 x float> [[WIDE_LOAD12_5]])
; CORTEXA55-NEXT: store <4 x float> [[TMP17]], ptr [[TMP16]], align 4
; CORTEXA55-NEXT: [[INDEX_NEXT_5:%.*]] = add nuw nsw i64 [[INDEX]], 24
; CORTEXA55-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_5]]
; CORTEXA55-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x float>, ptr [[TMP18]], align 4
; CORTEXA55-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_5]]
; CORTEXA55-NEXT: [[WIDE_LOAD12_6:%.*]] = load <4 x float>, ptr [[TMP19]], align 4
; CORTEXA55-NEXT: [[TMP20:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_6]], <4 x float> [[WIDE_LOAD12_6]])
; CORTEXA55-NEXT: store <4 x float> [[TMP20]], ptr [[TMP19]], align 4
; CORTEXA55-NEXT: [[INDEX_NEXT_6:%.*]] = add nuw nsw i64 [[INDEX]], 28
; CORTEXA55-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_6]]
; CORTEXA55-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x float>, ptr [[TMP21]], align 4
; CORTEXA55-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_6]]
; CORTEXA55-NEXT: [[WIDE_LOAD12_7:%.*]] = load <4 x float>, ptr [[TMP22]], align 4
; CORTEXA55-NEXT: [[TMP23:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_7]], <4 x float> [[WIDE_LOAD12_7]])
; CORTEXA55-NEXT: store <4 x float> [[TMP23]], ptr [[TMP22]], align 4
; CORTEXA55-NEXT: [[INDEX_NEXT_7:%.*]] = add nuw nsw i64 [[INDEX]], 32
; CORTEXA55-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_7]]
; CORTEXA55-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x float>, ptr [[TMP24]], align 4
; CORTEXA55-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_7]]
; CORTEXA55-NEXT: [[WIDE_LOAD12_8:%.*]] = load <4 x float>, ptr [[TMP25]], align 4
; CORTEXA55-NEXT: [[TMP26:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_8]], <4 x float> [[WIDE_LOAD12_8]])
; CORTEXA55-NEXT: store <4 x float> [[TMP26]], ptr [[TMP25]], align 4
; CORTEXA55-NEXT: [[INDEX_NEXT_8:%.*]] = add nuw nsw i64 [[INDEX]], 36
; CORTEXA55-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_8]]
; CORTEXA55-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x float>, ptr [[TMP27]], align 4
; CORTEXA55-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_8]]
; CORTEXA55-NEXT: [[WIDE_LOAD12_9:%.*]] = load <4 x float>, ptr [[TMP28]], align 4
; CORTEXA55-NEXT: [[TMP29:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_9]], <4 x float> [[WIDE_LOAD12_9]])
; CORTEXA55-NEXT: store <4 x float> [[TMP29]], ptr [[TMP28]], align 4
; CORTEXA55-NEXT: [[INDEX_NEXT_9:%.*]] = add nuw nsw i64 [[INDEX]], 40
; CORTEXA55-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_9]]
; CORTEXA55-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x float>, ptr [[TMP30]], align 4
; CORTEXA55-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_9]]
; CORTEXA55-NEXT: [[WIDE_LOAD12_10:%.*]] = load <4 x float>, ptr [[TMP31]], align 4
; CORTEXA55-NEXT: [[TMP32:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_10]], <4 x float> [[WIDE_LOAD12_10]])
; CORTEXA55-NEXT: store <4 x float> [[TMP32]], ptr [[TMP31]], align 4
; CORTEXA55-NEXT: [[INDEX_NEXT_10:%.*]] = add nuw nsw i64 [[INDEX]], 44
; CORTEXA55-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_10]]
; CORTEXA55-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x float>, ptr [[TMP33]], align 4
; CORTEXA55-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_10]]
; CORTEXA55-NEXT: [[WIDE_LOAD12_11:%.*]] = load <4 x float>, ptr [[TMP34]], align 4
; CORTEXA55-NEXT: [[TMP35:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_11]], <4 x float> [[WIDE_LOAD12_11]])
; CORTEXA55-NEXT: store <4 x float> [[TMP35]], ptr [[TMP34]], align 4
; CORTEXA55-NEXT: [[INDEX_NEXT_11:%.*]] = add nuw nsw i64 [[INDEX]], 48
; CORTEXA55-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_11]]
; CORTEXA55-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x float>, ptr [[TMP36]], align 4
; CORTEXA55-NEXT: [[TMP37:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_11]]
; CORTEXA55-NEXT: [[WIDE_LOAD12_12:%.*]] = load <4 x float>, ptr [[TMP37]], align 4
; CORTEXA55-NEXT: [[TMP38:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_12]], <4 x float> [[WIDE_LOAD12_12]])
; CORTEXA55-NEXT: store <4 x float> [[TMP38]], ptr [[TMP37]], align 4
; CORTEXA55-NEXT: [[INDEX_NEXT_12:%.*]] = add nuw nsw i64 [[INDEX]], 52
; CORTEXA55-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_12]]
; CORTEXA55-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x float>, ptr [[TMP39]], align 4
; CORTEXA55-NEXT: [[TMP40:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_12]]
; CORTEXA55-NEXT: [[WIDE_LOAD12_13:%.*]] = load <4 x float>, ptr [[TMP40]], align 4
; CORTEXA55-NEXT: [[TMP41:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_13]], <4 x float> [[WIDE_LOAD12_13]])
; CORTEXA55-NEXT: store <4 x float> [[TMP41]], ptr [[TMP40]], align 4
; CORTEXA55-NEXT: [[INDEX_NEXT_13:%.*]] = add nuw nsw i64 [[INDEX]], 56
; CORTEXA55-NEXT: [[TMP42:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_13]]
; CORTEXA55-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x float>, ptr [[TMP42]], align 4
; CORTEXA55-NEXT: [[TMP43:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_13]]
; CORTEXA55-NEXT: [[WIDE_LOAD12_14:%.*]] = load <4 x float>, ptr [[TMP43]], align 4
; CORTEXA55-NEXT: [[TMP44:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_14]], <4 x float> [[WIDE_LOAD12_14]])
; CORTEXA55-NEXT: store <4 x float> [[TMP44]], ptr [[TMP43]], align 4
; CORTEXA55-NEXT: [[INDEX_NEXT_14:%.*]] = add nuw nsw i64 [[INDEX]], 60
; CORTEXA55-NEXT: [[TMP45:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_14]]
; CORTEXA55-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x float>, ptr [[TMP45]], align 4
; CORTEXA55-NEXT: [[TMP46:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_14]]
; CORTEXA55-NEXT: [[WIDE_LOAD12_15:%.*]] = load <4 x float>, ptr [[TMP46]], align 4
; CORTEXA55-NEXT: [[TMP47:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_15]], <4 x float> [[WIDE_LOAD12_15]])
; CORTEXA55-NEXT: store <4 x float> [[TMP47]], ptr [[TMP46]], align 4
; CORTEXA55-NEXT: [[INDEX_NEXT_15]] = add nuw nsw i64 [[INDEX]], 64
; CORTEXA55-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT_15]], 1024
; CORTEXA55-NEXT: br i1 [[TMP48]], label %[[EXIT:.*]], label %[[VECTOR_BODY]]
; CORTEXA55: [[EXIT]]:
; CORTEXA55-NEXT: ret void
;
entry:
%broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
%broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds nuw float, ptr %src, i64 %index
%wide.load = load <4 x float>, ptr %0, align 4
%1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
%wide.load12 = load <4 x float>, ptr %1, align 4
%2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
store <4 x float> %2, ptr %1, align 4
%index.next = add nuw i64 %index, 4
%3 = icmp eq i64 %index.next, 1024
br i1 %3, label %exit, label %vector.body
exit: ; preds = %vector.body
ret void
}
define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
; APPLE-LABEL: define void @saxpy_tripcount1K_av1(
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
; APPLE-NEXT: [[ENTRY:.*]]:
; APPLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; APPLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; APPLE-NEXT: br label %[[VECTOR_BODY:.*]]
; APPLE: [[VECTOR_BODY]]:
; APPLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; APPLE-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
; APPLE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
; APPLE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
; APPLE-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; APPLE-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; APPLE-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
; APPLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; APPLE-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; APPLE-NEXT: br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; APPLE: [[EXIT]]:
; APPLE-NEXT: ret void
;
; CORTEXA55-LABEL: define void @saxpy_tripcount1K_av1(
; CORTEXA55-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
; CORTEXA55-NEXT: [[ENTRY:.*]]:
; CORTEXA55-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; CORTEXA55-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; CORTEXA55-NEXT: br label %[[VECTOR_BODY:.*]]
; CORTEXA55: [[VECTOR_BODY]]:
; CORTEXA55-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CORTEXA55-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
; CORTEXA55-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
; CORTEXA55-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
; CORTEXA55-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CORTEXA55-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; CORTEXA55-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
; CORTEXA55-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CORTEXA55-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CORTEXA55-NEXT: br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CORTEXA55: [[EXIT]]:
; CORTEXA55-NEXT: ret void
;
entry:
%broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
%broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds nuw float, ptr %src, i64 %index
%wide.load = load <4 x float>, ptr %0, align 4
%1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
%wide.load12 = load <4 x float>, ptr %1, align 4
%2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
store <4 x float> %2, ptr %1, align 4
%index.next = add nuw i64 %index, 4
%3 = icmp eq i64 %index.next, 1024
br i1 %3, label %exit, label %vector.body, !llvm.loop !0
exit: ; preds = %vector.body
ret void
}
!0 = !{!0, !1}
!1 = !{!"llvm.loop.isvectorized", i32 1}
; On Cortex-A55 we should runtime unroll the scalar epilogue loop, but not the
; vector loop.
define void @scalar_epilogue(ptr %p, i8 %splat.scalar, i64 %n) {
; APPLE-LABEL: define void @scalar_epilogue(
; APPLE-SAME: ptr [[P:%.*]], i8 [[SPLAT_SCALAR:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; APPLE-NEXT: [[ENTRY:.*]]:
; APPLE-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[N]], 32
; APPLE-NEXT: br i1 [[MIN_ITERS_CHECK7]], label %[[SCALAR_REMAINDER_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
; APPLE: [[VECTOR_PH]]:
; APPLE-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -32
; APPLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[SPLAT_SCALAR]], i64 0
; APPLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
; APPLE-NEXT: br label %[[VECTOR_BODY:.*]]
; APPLE: [[VECTOR_BODY]]:
; APPLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; APPLE-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INDEX]]
; APPLE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
; APPLE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; APPLE-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; APPLE-NEXT: [[TMP2:%.*]] = add <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; APPLE-NEXT: [[TMP3:%.*]] = add <16 x i8> [[WIDE_LOAD8]], [[BROADCAST_SPLAT]]
; APPLE-NEXT: store <16 x i8> [[TMP2]], ptr [[TMP0]], align 1
; APPLE-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP1]], align 1
; APPLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; APPLE-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; APPLE-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; APPLE: [[MIDDLE_BLOCK]]:
; APPLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; APPLE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_REMAINDER_PREHEADER]]
; APPLE: [[SCALAR_REMAINDER_PREHEADER]]:
; APPLE-NEXT: [[IV_SCALAR_LOOP_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ]
; APPLE-NEXT: br label %[[SCALAR_REMAINDER:.*]]
; APPLE: [[SCALAR_REMAINDER]]:
; APPLE-NEXT: [[I_06:%.*]] = phi i64 [ [[INC:%.*]], %[[SCALAR_REMAINDER]] ], [ [[IV_SCALAR_LOOP_PH]], %[[SCALAR_REMAINDER_PREHEADER]] ]
; APPLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[I_06]]
; APPLE-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; APPLE-NEXT: [[ADD:%.*]] = add i8 [[TMP8]], [[SPLAT_SCALAR]]
; APPLE-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
; APPLE-NEXT: [[INC]] = add nuw i64 [[I_06]], 1
; APPLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
; APPLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_REMAINDER]], !llvm.loop [[LOOP5:![0-9]+]]
; APPLE: [[EXIT_LOOPEXIT]]:
; APPLE-NEXT: br label %[[EXIT]]
; APPLE: [[EXIT]]:
; APPLE-NEXT: ret void
;
; CORTEXA55-LABEL: define void @scalar_epilogue(
; CORTEXA55-SAME: ptr [[P:%.*]], i8 [[SPLAT_SCALAR:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CORTEXA55-NEXT: [[ENTRY:.*]]:
; CORTEXA55-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[N]], 32
; CORTEXA55-NEXT: br i1 [[MIN_ITERS_CHECK7]], label %[[SCALAR_REMAINDER_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
; CORTEXA55: [[VECTOR_PH]]:
; CORTEXA55-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -32
; CORTEXA55-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[SPLAT_SCALAR]], i64 0
; CORTEXA55-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
; CORTEXA55-NEXT: br label %[[VECTOR_BODY:.*]]
; CORTEXA55: [[VECTOR_BODY]]:
; CORTEXA55-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CORTEXA55-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INDEX]]
; CORTEXA55-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
; CORTEXA55-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; CORTEXA55-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CORTEXA55-NEXT: [[TMP2:%.*]] = add <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; CORTEXA55-NEXT: [[TMP3:%.*]] = add <16 x i8> [[WIDE_LOAD8]], [[BROADCAST_SPLAT]]
; CORTEXA55-NEXT: store <16 x i8> [[TMP2]], ptr [[TMP0]], align 1
; CORTEXA55-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP1]], align 1
; CORTEXA55-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CORTEXA55-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CORTEXA55-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CORTEXA55: [[MIDDLE_BLOCK]]:
; CORTEXA55-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CORTEXA55-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_REMAINDER_PREHEADER]]
; CORTEXA55: [[SCALAR_REMAINDER_PREHEADER]]:
; CORTEXA55-NEXT: [[I_06_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ]
; CORTEXA55-NEXT: [[TMP8:%.*]] = sub i64 [[N]], [[I_06_PH]]
; CORTEXA55-NEXT: [[TMP9:%.*]] = add i64 [[N]], -1
; CORTEXA55-NEXT: [[TMP10:%.*]] = sub i64 [[TMP9]], [[I_06_PH]]
; CORTEXA55-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP8]], 3
; CORTEXA55-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
; CORTEXA55-NEXT: br i1 [[LCMP_MOD]], label %[[SCALAR_REMAINDER_PROL_PREHEADER:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT:.*]]
; CORTEXA55: [[SCALAR_REMAINDER_PROL_PREHEADER]]:
; CORTEXA55-NEXT: br label %[[SCALAR_REMAINDER_PROL:.*]]
; CORTEXA55: [[SCALAR_REMAINDER_PROL]]:
; CORTEXA55-NEXT: [[ARRAYIDX_PROL:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[I_06_PH]]
; CORTEXA55-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX_PROL]], align 1
; CORTEXA55-NEXT: [[ADD_PROL:%.*]] = add i8 [[TMP11]], [[SPLAT_SCALAR]]
; CORTEXA55-NEXT: store i8 [[ADD_PROL]], ptr [[ARRAYIDX_PROL]], align 1
; CORTEXA55-NEXT: [[INC_PROL:%.*]] = add nuw i64 [[I_06_PH]], 1
; CORTEXA55-NEXT: [[PROL_ITER_CMP:%.*]] = icmp ne i64 1, [[XTRAITER]]
; CORTEXA55-NEXT: br i1 [[PROL_ITER_CMP]], label %[[SCALAR_REMAINDER_PROL_1:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA:.*]]
; CORTEXA55: [[SCALAR_REMAINDER_PROL_1]]:
; CORTEXA55-NEXT: [[ARRAYIDX_PROL_1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_PROL]]
; CORTEXA55-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX_PROL_1]], align 1
; CORTEXA55-NEXT: [[ADD_PROL_1:%.*]] = add i8 [[TMP12]], [[SPLAT_SCALAR]]
; CORTEXA55-NEXT: store i8 [[ADD_PROL_1]], ptr [[ARRAYIDX_PROL_1]], align 1
; CORTEXA55-NEXT: [[INC_PROL_1:%.*]] = add nuw i64 [[I_06_PH]], 2
; CORTEXA55-NEXT: [[PROL_ITER_CMP_1:%.*]] = icmp ne i64 2, [[XTRAITER]]
; CORTEXA55-NEXT: br i1 [[PROL_ITER_CMP_1]], label %[[SCALAR_REMAINDER_PROL_2:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]
; CORTEXA55: [[SCALAR_REMAINDER_PROL_2]]:
; CORTEXA55-NEXT: [[ARRAYIDX_PROL_2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_PROL_1]]
; CORTEXA55-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX_PROL_2]], align 1
; CORTEXA55-NEXT: [[ADD_PROL_2:%.*]] = add i8 [[TMP13]], [[SPLAT_SCALAR]]
; CORTEXA55-NEXT: store i8 [[ADD_PROL_2]], ptr [[ARRAYIDX_PROL_2]], align 1
; CORTEXA55-NEXT: [[INC_PROL_2:%.*]] = add nuw i64 [[I_06_PH]], 3
; CORTEXA55-NEXT: br label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]
; CORTEXA55: [[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]:
; CORTEXA55-NEXT: [[IV_SCALAR_LOOP_UNR_PH:%.*]] = phi i64 [ [[INC_PROL]], %[[SCALAR_REMAINDER_PROL]] ], [ [[INC_PROL_1]], %[[SCALAR_REMAINDER_PROL_1]] ], [ [[INC_PROL_2]], %[[SCALAR_REMAINDER_PROL_2]] ]
; CORTEXA55-NEXT: br label %[[SCALAR_REMAINDER_PROL_LOOPEXIT]]
; CORTEXA55: [[SCALAR_REMAINDER_PROL_LOOPEXIT]]:
; CORTEXA55-NEXT: [[IV_SCALAR_LOOP_UNR:%.*]] = phi i64 [ [[I_06_PH]], %[[SCALAR_REMAINDER_PREHEADER]] ], [ [[IV_SCALAR_LOOP_UNR_PH]], %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]] ]
; CORTEXA55-NEXT: [[TMP14:%.*]] = icmp ult i64 [[TMP10]], 3
; CORTEXA55-NEXT: br i1 [[TMP14]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_REMAINDER_PREHEADER_NEW:.*]]
; CORTEXA55: [[SCALAR_REMAINDER_PREHEADER_NEW]]:
; CORTEXA55-NEXT: br label %[[SCALAR_REMAINDER:.*]]
; CORTEXA55: [[SCALAR_REMAINDER]]:
; CORTEXA55-NEXT: [[I_06:%.*]] = phi i64 [ [[IV_SCALAR_LOOP_UNR]], %[[SCALAR_REMAINDER_PREHEADER_NEW]] ], [ [[INC_3:%.*]], %[[SCALAR_REMAINDER]] ]
; CORTEXA55-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[I_06]]
; CORTEXA55-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CORTEXA55-NEXT: [[ADD:%.*]] = add i8 [[TMP15]], [[SPLAT_SCALAR]]
; CORTEXA55-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
; CORTEXA55-NEXT: [[INC:%.*]] = add nuw i64 [[I_06]], 1
; CORTEXA55-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC]]
; CORTEXA55-NEXT: [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1
; CORTEXA55-NEXT: [[ADD_1:%.*]] = add i8 [[TMP16]], [[SPLAT_SCALAR]]
; CORTEXA55-NEXT: store i8 [[ADD_1]], ptr [[ARRAYIDX_1]], align 1
; CORTEXA55-NEXT: [[INC_1:%.*]] = add nuw i64 [[I_06]], 2
; CORTEXA55-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_1]]
; CORTEXA55-NEXT: [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1
; CORTEXA55-NEXT: [[ADD_2:%.*]] = add i8 [[TMP17]], [[SPLAT_SCALAR]]
; CORTEXA55-NEXT: store i8 [[ADD_2]], ptr [[ARRAYIDX_2]], align 1
; CORTEXA55-NEXT: [[INC_2:%.*]] = add nuw i64 [[I_06]], 3
; CORTEXA55-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_2]]
; CORTEXA55-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1
; CORTEXA55-NEXT: [[ADD_3:%.*]] = add i8 [[TMP18]], [[SPLAT_SCALAR]]
; CORTEXA55-NEXT: store i8 [[ADD_3]], ptr [[ARRAYIDX_3]], align 1
; CORTEXA55-NEXT: [[INC_3]] = add nuw i64 [[I_06]], 4
; CORTEXA55-NEXT: [[EXITCOND_NOT_3:%.*]] = icmp eq i64 [[INC_3]], [[N]]
; CORTEXA55-NEXT: br i1 [[EXITCOND_NOT_3]], label %[[EXIT_LOOPEXIT_UNR_LCSSA:.*]], label %[[SCALAR_REMAINDER]], !llvm.loop [[LOOP3:![0-9]+]]
; CORTEXA55: [[EXIT_LOOPEXIT_UNR_LCSSA]]:
; CORTEXA55-NEXT: br label %[[EXIT_LOOPEXIT]]
; CORTEXA55: [[EXIT_LOOPEXIT]]:
; CORTEXA55-NEXT: br label %[[EXIT]]
; CORTEXA55: [[EXIT]]:
; CORTEXA55-NEXT: ret void
;
entry:
%min.iters.check = icmp ult i64 %n, 32
br i1 %min.iters.check, label %scalar.remainder, label %vector.ph
vector.ph:
%n.vec = and i64 %n, -32
%broadcast.splatinsert = insertelement <16 x i8> poison, i8 %splat.scalar, i64 0
%broadcast.splat = shufflevector <16 x i8> %broadcast.splatinsert, <16 x i8> poison, <16 x i32> zeroinitializer
br label %vector.body
vector.body:
%iv = phi i64 [ 0, %vector.ph ], [ %iv.next, %vector.body ]
%gep.p.iv = getelementptr inbounds nuw i8, ptr %p, i64 %iv
%gep.p.iv.16 = getelementptr inbounds nuw i8, ptr %gep.p.iv, i64 16
%wide.load = load <16 x i8>, ptr %gep.p.iv, align 1
%wide.load.2 = load <16 x i8>, ptr %gep.p.iv.16, align 1
%add.broadcast = add <16 x i8> %wide.load, %broadcast.splat
%add.broadcast.2 = add <16 x i8> %wide.load.2, %broadcast.splat
store <16 x i8> %add.broadcast, ptr %gep.p.iv, align 1
store <16 x i8> %add.broadcast.2, ptr %gep.p.iv.16, align 1
%iv.next = add nuw i64 %iv, 32
%exit.cond = icmp eq i64 %iv.next, %n.vec
br i1 %exit.cond, label %middle.block, label %vector.body, !llvm.loop !2
middle.block:
%cmp.n = icmp eq i64 %n, %n.vec
br i1 %cmp.n, label %exit, label %scalar.remainder
scalar.remainder:
%iv.scalar.loop = phi i64 [ %inc, %scalar.remainder ], [ %n.vec, %middle.block ], [ 0, %entry ]
%arrayidx = getelementptr inbounds nuw i8, ptr %p, i64 %iv.scalar.loop
%scalar.load = load i8, ptr %arrayidx, align 1
%add = add i8 %scalar.load, %splat.scalar
store i8 %add, ptr %arrayidx, align 1
%inc = add nuw i64 %iv.scalar.loop, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %exit, label %scalar.remainder, !llvm.loop !3
exit:
ret void
}
!2 = distinct !{!2, !1}
!3 = distinct !{!3, !1}
;.
; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
; APPLE: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]}
; APPLE: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
; APPLE: [[LOOP4]] = distinct !{[[LOOP4]], [[META3]]}
; APPLE: [[LOOP5]] = distinct !{[[LOOP5]], [[META3]]}
;.
; CORTEXA55: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
; CORTEXA55: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CORTEXA55: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
; CORTEXA55: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
;.