llvm-project/llvm/test/Transforms/LoopUnroll/AArch64/vector.ll

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
; RUN: opt -p loop-unroll -mtriple=aarch64-unknown-linux -mcpu=cortex-a55 -S %s | FileCheck %s -check-prefix=CORTEXA55

define void @reverse(ptr %dst, ptr %src, i64 %len) {
; APPLE-LABEL: define void @reverse(
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
; APPLE-NEXT:  [[ENTRY:.*]]:
; APPLE-NEXT:    [[TMP0:%.*]] = add i64 [[LEN]], -1
; APPLE-NEXT:    [[XTRAITER:%.*]] = and i64 [[LEN]], 7
; APPLE-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7
; APPLE-NEXT:    br i1 [[TMP1]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[ENTRY_NEW:.*]]
; APPLE:       [[ENTRY_NEW]]:
; APPLE-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
; APPLE-NEXT:    br label %[[FOR_BODY:.*]]
; APPLE:       [[FOR_BODY]]:
; APPLE-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
; APPLE-NEXT:    [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
; APPLE-NEXT:    [[TMP2:%.*]] = sub nsw i64 [[LEN]], [[IV]]
; APPLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP2]]
; APPLE-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
; APPLE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
; APPLE-NEXT:    store <4 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 16
; APPLE-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
; APPLE-NEXT:    [[TMP4:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT]]
; APPLE-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP4]]
; APPLE-NEXT:    [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
; APPLE-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT]]
; APPLE-NEXT:    store <4 x float> [[TMP5]], ptr [[ARRAYIDX2_1]], align 16
; APPLE-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
; APPLE-NEXT:    [[TMP6:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_1]]
; APPLE-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP6]]
; APPLE-NEXT:    [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
; APPLE-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_1]]
; APPLE-NEXT:    store <4 x float> [[TMP7]], ptr [[ARRAYIDX2_2]], align 16
; APPLE-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
; APPLE-NEXT:    [[TMP8:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_2]]
; APPLE-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP8]]
; APPLE-NEXT:    [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
; APPLE-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_2]]
; APPLE-NEXT:    store <4 x float> [[TMP9]], ptr [[ARRAYIDX2_3]], align 16
; APPLE-NEXT:    [[IV_NEXT_3:%.*]] = add nuw nsw i64 [[IV]], 4
; APPLE-NEXT:    [[TMP10:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_3]]
; APPLE-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP10]]
; APPLE-NEXT:    [[TMP11:%.*]] = load <4 x float>, ptr [[ARRAYIDX_4]], align 16
; APPLE-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_3]]
; APPLE-NEXT:    store <4 x float> [[TMP11]], ptr [[ARRAYIDX2_4]], align 16
; APPLE-NEXT:    [[IV_NEXT_4:%.*]] = add nuw nsw i64 [[IV]], 5
; APPLE-NEXT:    [[TMP12:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_4]]
; APPLE-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP12]]
; APPLE-NEXT:    [[TMP13:%.*]] = load <4 x float>, ptr [[ARRAYIDX_5]], align 16
; APPLE-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_4]]
; APPLE-NEXT:    store <4 x float> [[TMP13]], ptr [[ARRAYIDX2_5]], align 16
; APPLE-NEXT:    [[IV_NEXT_5:%.*]] = add nuw nsw i64 [[IV]], 6
; APPLE-NEXT:    [[TMP14:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_5]]
; APPLE-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP14]]
; APPLE-NEXT:    [[TMP15:%.*]] = load <4 x float>, ptr [[ARRAYIDX_6]], align 16
; APPLE-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_5]]
; APPLE-NEXT:    store <4 x float> [[TMP15]], ptr [[ARRAYIDX2_6]], align 16
; APPLE-NEXT:    [[IV_NEXT_6:%.*]] = add nuw nsw i64 [[IV]], 7
; APPLE-NEXT:    [[TMP16:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_6]]
; APPLE-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP16]]
; APPLE-NEXT:    [[TMP17:%.*]] = load <4 x float>, ptr [[ARRAYIDX_7]], align 16
; APPLE-NEXT:    [[ARRAYIDX2_7:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_6]]
; APPLE-NEXT:    store <4 x float> [[TMP17]], ptr [[ARRAYIDX2_7]], align 16
; APPLE-NEXT:    [[IV_NEXT_7]] = add nuw nsw i64 [[IV]], 8
; APPLE-NEXT:    [[NITER_NEXT_7]] = add i64 [[NITER]], 8
; APPLE-NEXT:    [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
; APPLE-NEXT:    br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA:.*]], label %[[FOR_BODY]]
; APPLE:       [[EXIT_UNR_LCSSA]]:
; APPLE-NEXT:    [[IV_UNR:%.*]] = phi i64 [ [[IV_NEXT_7]], %[[FOR_BODY]] ]
; APPLE-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
; APPLE-NEXT:    br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER]], label %[[EXIT:.*]]
; APPLE:       [[FOR_BODY_EPIL_PREHEADER]]:
; APPLE-NEXT:    [[IV_EPIL_INIT:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR]], %[[EXIT_UNR_LCSSA]] ]
; APPLE-NEXT:    [[LCMP_MOD1:%.*]] = icmp ne i64 [[XTRAITER]], 0
; APPLE-NEXT:    call void @llvm.assume(i1 [[LCMP_MOD1]])
; APPLE-NEXT:    br label %[[FOR_BODY_EPIL:.*]]
; APPLE:       [[FOR_BODY_EPIL]]:
; APPLE-NEXT:    [[IV_EPIL:%.*]] = phi i64 [ [[IV_EPIL_INIT]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
; APPLE-NEXT:    [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
; APPLE-NEXT:    [[TMP18:%.*]] = sub nsw i64 [[LEN]], [[IV_EPIL]]
; APPLE-NEXT:    [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP18]]
; APPLE-NEXT:    [[TMP19:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
; APPLE-NEXT:    [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_EPIL]]
; APPLE-NEXT:    store <4 x float> [[TMP19]], ptr [[ARRAYIDX2_EPIL]], align 16
; APPLE-NEXT:    [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
; APPLE-NEXT:    [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[LEN]]
; APPLE-NEXT:    [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
; APPLE-NEXT:    [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
; APPLE-NEXT:    br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
; APPLE:       [[EXIT_EPILOG_LCSSA]]:
; APPLE-NEXT:    br label %[[EXIT]]
; APPLE:       [[EXIT]]:
; APPLE-NEXT:    ret void
;
; CORTEXA55-LABEL: define void @reverse(
; CORTEXA55-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
; CORTEXA55-NEXT:  [[ENTRY:.*]]:
; CORTEXA55-NEXT:    [[TMP0:%.*]] = add i64 [[LEN]], -1
; CORTEXA55-NEXT:    [[XTRAITER:%.*]] = and i64 [[LEN]], 3
; CORTEXA55-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 3
; CORTEXA55-NEXT:    br i1 [[TMP1]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[ENTRY_NEW:.*]]
; CORTEXA55:       [[ENTRY_NEW]]:
; CORTEXA55-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
; CORTEXA55-NEXT:    br label %[[FOR_BODY:.*]]
; CORTEXA55:       [[FOR_BODY]]:
; CORTEXA55-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_3:%.*]], %[[FOR_BODY]] ]
; CORTEXA55-NEXT:    [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_3:%.*]], %[[FOR_BODY]] ]
; CORTEXA55-NEXT:    [[TMP2:%.*]] = sub nsw i64 [[LEN]], [[IV]]
; CORTEXA55-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP2]]
; CORTEXA55-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
; CORTEXA55-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
; CORTEXA55-NEXT:    store <4 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 16
; CORTEXA55-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
; CORTEXA55-NEXT:    [[TMP4:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT]]
; CORTEXA55-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP4]]
; CORTEXA55-NEXT:    [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
; CORTEXA55-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT]]
; CORTEXA55-NEXT:    store <4 x float> [[TMP5]], ptr [[ARRAYIDX2_1]], align 16
; CORTEXA55-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
; CORTEXA55-NEXT:    [[TMP6:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_1]]
; CORTEXA55-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP6]]
; CORTEXA55-NEXT:    [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
; CORTEXA55-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_1]]
; CORTEXA55-NEXT:    store <4 x float> [[TMP7]], ptr [[ARRAYIDX2_2]], align 16
; CORTEXA55-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
; CORTEXA55-NEXT:    [[TMP8:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_2]]
; CORTEXA55-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP8]]
; CORTEXA55-NEXT:    [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
; CORTEXA55-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_2]]
; CORTEXA55-NEXT:    store <4 x float> [[TMP9]], ptr [[ARRAYIDX2_3]], align 16
; CORTEXA55-NEXT:    [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
; CORTEXA55-NEXT:    [[NITER_NEXT_3]] = add i64 [[NITER]], 4
; CORTEXA55-NEXT:    [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NEXT_3]], [[UNROLL_ITER]]
; CORTEXA55-NEXT:    br i1 [[NITER_NCMP_3]], label %[[EXIT_UNR_LCSSA:.*]], label %[[FOR_BODY]]
; CORTEXA55:       [[EXIT_UNR_LCSSA]]:
; CORTEXA55-NEXT:    [[IV_UNR1:%.*]] = phi i64 [ [[IV_NEXT_3]], %[[FOR_BODY]] ]
; CORTEXA55-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
; CORTEXA55-NEXT:    br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER]], label %[[EXIT:.*]]
; CORTEXA55:       [[FOR_BODY_EPIL_PREHEADER]]:
; CORTEXA55-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR1]], %[[EXIT_UNR_LCSSA]] ]
; CORTEXA55-NEXT:    [[LCMP_MOD1:%.*]] = icmp ne i64 [[XTRAITER]], 0
; CORTEXA55-NEXT:    call void @llvm.assume(i1 [[LCMP_MOD1]])
; CORTEXA55-NEXT:    br label %[[FOR_BODY_EPIL:.*]]
; CORTEXA55:       [[FOR_BODY_EPIL]]:
; CORTEXA55-NEXT:    [[TMP10:%.*]] = sub nsw i64 [[LEN]], [[IV_UNR]]
; CORTEXA55-NEXT:    [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP10]]
; CORTEXA55-NEXT:    [[TMP11:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
; CORTEXA55-NEXT:    [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_UNR]]
; CORTEXA55-NEXT:    store <4 x float> [[TMP11]], ptr [[ARRAYIDX2_EPIL]], align 16
; CORTEXA55-NEXT:    [[IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[IV_UNR]], 1
; CORTEXA55-NEXT:    [[EPIL_ITER_CMP:%.*]] = icmp ne i64 1, [[XTRAITER]]
; CORTEXA55-NEXT:    br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL_1:.*]], label %[[EXIT_EPILOG_LCSSA:.*]]
; CORTEXA55:       [[FOR_BODY_EPIL_1]]:
; CORTEXA55-NEXT:    [[TMP12:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL]]
; CORTEXA55-NEXT:    [[ARRAYIDX_EPIL_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP12]]
; CORTEXA55-NEXT:    [[TMP13:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_1]], align 16
; CORTEXA55-NEXT:    [[ARRAYIDX2_EPIL_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL]]
; CORTEXA55-NEXT:    store <4 x float> [[TMP13]], ptr [[ARRAYIDX2_EPIL_1]], align 16
; CORTEXA55-NEXT:    [[IV_NEXT_EPIL_1:%.*]] = add nuw nsw i64 [[IV_UNR]], 2
; CORTEXA55-NEXT:    [[EPIL_ITER_CMP_1:%.*]] = icmp ne i64 2, [[XTRAITER]]
; CORTEXA55-NEXT:    br i1 [[EPIL_ITER_CMP_1]], label %[[FOR_BODY_EPIL_2:.*]], label %[[EXIT_EPILOG_LCSSA]]
; CORTEXA55:       [[FOR_BODY_EPIL_2]]:
; CORTEXA55-NEXT:    [[TMP14:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL_1]]
; CORTEXA55-NEXT:    [[ARRAYIDX_EPIL_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP14]]
; CORTEXA55-NEXT:    [[TMP15:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_2]], align 16
; CORTEXA55-NEXT:    [[ARRAYIDX2_EPIL_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL_1]]
; CORTEXA55-NEXT:    store <4 x float> [[TMP15]], ptr [[ARRAYIDX2_EPIL_2]], align 16
; CORTEXA55-NEXT:    br label %[[EXIT_EPILOG_LCSSA]]
; CORTEXA55:       [[EXIT_EPILOG_LCSSA]]:
; CORTEXA55-NEXT:    br label %[[EXIT]]
; CORTEXA55:       [[EXIT]]:
; CORTEXA55-NEXT:    ret void
;
entry:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %1 = sub nsw i64 %len, %iv
  %arrayidx = getelementptr inbounds <4 x float>, ptr %src, i64 %1
  %2 = load <4 x float>, ptr %arrayidx, align 16
  %arrayidx2 = getelementptr inbounds nuw <4 x float>, ptr %dst, i64 %iv
  store <4 x float> %2, ptr %arrayidx2, align 16
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %len
  br i1 %exitcond.not, label %exit, label %for.body

exit:                                 ; preds = %for.body, %entry
  ret void
}


define void @saxpy_tripcount8_full_unroll(ptr %dst, ptr %src, float %a) {
; APPLE-LABEL: define void @saxpy_tripcount8_full_unroll(
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
; APPLE-NEXT:  [[ENTRY:.*:]]
; APPLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; APPLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; APPLE-NEXT:    br label %[[VECTOR_BODY:.*]]
; APPLE:       [[VECTOR_BODY]]:
; APPLE-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[SRC]], align 4
; APPLE-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[DST]], align 4
; APPLE-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; APPLE-NEXT:    store <4 x float> [[TMP0]], ptr [[DST]], align 4
; APPLE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 4
; APPLE-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; APPLE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 4
; APPLE-NEXT:    [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; APPLE-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
; APPLE-NEXT:    store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
; APPLE-NEXT:    ret void
;
; CORTEXA55-LABEL: define void @saxpy_tripcount8_full_unroll(
; CORTEXA55-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
; CORTEXA55-NEXT:  [[ENTRY:.*:]]
; CORTEXA55-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; CORTEXA55-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; CORTEXA55-NEXT:    br label %[[VECTOR_BODY:.*]]
; CORTEXA55:       [[VECTOR_BODY]]:
; CORTEXA55-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[SRC]], align 4
; CORTEXA55-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[DST]], align 4
; CORTEXA55-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP0]], ptr [[DST]], align 4
; CORTEXA55-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 4
; CORTEXA55-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CORTEXA55-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 4
; CORTEXA55-NEXT:    [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; CORTEXA55-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
; CORTEXA55-NEXT:    ret void
;
entry:
  %broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
  %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds nuw float, ptr %src, i64 %index
  %wide.load = load <4 x float>, ptr %0, align 4
  %1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
  %wide.load12 = load <4 x float>, ptr %1, align 4
  %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
  store <4 x float> %2, ptr %1, align 4
  %index.next = add nuw i64 %index, 4
  %3 = icmp eq i64 %index.next, 8
  br i1 %3, label %exit, label %vector.body

exit:                                 ; preds = %vector.body
  ret void
}


define void @saxpy_tripcount1K_av0(ptr %dst, ptr %src, float %a) {
; APPLE-LABEL: define void @saxpy_tripcount1K_av0(
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
; APPLE-NEXT:  [[ENTRY:.*]]:
; APPLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; APPLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; APPLE-NEXT:    br label %[[VECTOR_BODY:.*]]
; APPLE:       [[VECTOR_BODY]]:
; APPLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; APPLE-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
; APPLE-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
; APPLE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
; APPLE-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; APPLE-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; APPLE-NEXT:    store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
; APPLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; APPLE-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; APPLE-NEXT:    br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]]
; APPLE:       [[EXIT]]:
; APPLE-NEXT:    ret void
;
; CORTEXA55-LABEL: define void @saxpy_tripcount1K_av0(
; CORTEXA55-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
; CORTEXA55-NEXT:  [[ENTRY:.*]]:
; CORTEXA55-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; CORTEXA55-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; CORTEXA55-NEXT:    br label %[[VECTOR_BODY:.*]]
; CORTEXA55:       [[VECTOR_BODY]]:
; CORTEXA55-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT_15:%.*]], %[[VECTOR_BODY]] ]
; CORTEXA55-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
; CORTEXA55-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
; CORTEXA55-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CORTEXA55-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT:%.*]] = add nuw nsw i64 [[INDEX]], 4
; CORTEXA55-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
; CORTEXA55-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; CORTEXA55-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP5]], ptr [[TMP4]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_1:%.*]] = add nuw nsw i64 [[INDEX]], 8
; CORTEXA55-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_1]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
; CORTEXA55-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_1]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_2:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
; CORTEXA55-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_2]], <4 x float> [[WIDE_LOAD12_2]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP8]], ptr [[TMP7]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_2:%.*]] = add nuw nsw i64 [[INDEX]], 12
; CORTEXA55-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_2]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x float>, ptr [[TMP9]], align 4
; CORTEXA55-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_2]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_3:%.*]] = load <4 x float>, ptr [[TMP10]], align 4
; CORTEXA55-NEXT:    [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_3]], <4 x float> [[WIDE_LOAD12_3]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP11]], ptr [[TMP10]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_3:%.*]] = add nuw nsw i64 [[INDEX]], 16
; CORTEXA55-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_3]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x float>, ptr [[TMP12]], align 4
; CORTEXA55-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_3]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_4:%.*]] = load <4 x float>, ptr [[TMP13]], align 4
; CORTEXA55-NEXT:    [[TMP14:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_4]], <4 x float> [[WIDE_LOAD12_4]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP14]], ptr [[TMP13]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_4:%.*]] = add nuw nsw i64 [[INDEX]], 20
; CORTEXA55-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_4]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x float>, ptr [[TMP15]], align 4
; CORTEXA55-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_4]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_5:%.*]] = load <4 x float>, ptr [[TMP16]], align 4
; CORTEXA55-NEXT:    [[TMP17:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_5]], <4 x float> [[WIDE_LOAD12_5]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP17]], ptr [[TMP16]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_5:%.*]] = add nuw nsw i64 [[INDEX]], 24
; CORTEXA55-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_5]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x float>, ptr [[TMP18]], align 4
; CORTEXA55-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_5]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_6:%.*]] = load <4 x float>, ptr [[TMP19]], align 4
; CORTEXA55-NEXT:    [[TMP20:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_6]], <4 x float> [[WIDE_LOAD12_6]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP20]], ptr [[TMP19]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_6:%.*]] = add nuw nsw i64 [[INDEX]], 28
; CORTEXA55-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_6]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x float>, ptr [[TMP21]], align 4
; CORTEXA55-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_6]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_7:%.*]] = load <4 x float>, ptr [[TMP22]], align 4
; CORTEXA55-NEXT:    [[TMP23:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_7]], <4 x float> [[WIDE_LOAD12_7]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP23]], ptr [[TMP22]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_7:%.*]] = add nuw nsw i64 [[INDEX]], 32
; CORTEXA55-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_7]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x float>, ptr [[TMP24]], align 4
; CORTEXA55-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_7]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_8:%.*]] = load <4 x float>, ptr [[TMP25]], align 4
; CORTEXA55-NEXT:    [[TMP26:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_8]], <4 x float> [[WIDE_LOAD12_8]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP26]], ptr [[TMP25]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_8:%.*]] = add nuw nsw i64 [[INDEX]], 36
; CORTEXA55-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_8]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x float>, ptr [[TMP27]], align 4
; CORTEXA55-NEXT:    [[TMP28:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_8]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_9:%.*]] = load <4 x float>, ptr [[TMP28]], align 4
; CORTEXA55-NEXT:    [[TMP29:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_9]], <4 x float> [[WIDE_LOAD12_9]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP29]], ptr [[TMP28]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_9:%.*]] = add nuw nsw i64 [[INDEX]], 40
; CORTEXA55-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_9]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x float>, ptr [[TMP30]], align 4
; CORTEXA55-NEXT:    [[TMP31:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_9]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_10:%.*]] = load <4 x float>, ptr [[TMP31]], align 4
; CORTEXA55-NEXT:    [[TMP32:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_10]], <4 x float> [[WIDE_LOAD12_10]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP32]], ptr [[TMP31]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_10:%.*]] = add nuw nsw i64 [[INDEX]], 44
; CORTEXA55-NEXT:    [[TMP33:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_10]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x float>, ptr [[TMP33]], align 4
; CORTEXA55-NEXT:    [[TMP34:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_10]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_11:%.*]] = load <4 x float>, ptr [[TMP34]], align 4
; CORTEXA55-NEXT:    [[TMP35:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_11]], <4 x float> [[WIDE_LOAD12_11]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP35]], ptr [[TMP34]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_11:%.*]] = add nuw nsw i64 [[INDEX]], 48
; CORTEXA55-NEXT:    [[TMP36:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_11]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x float>, ptr [[TMP36]], align 4
; CORTEXA55-NEXT:    [[TMP37:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_11]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_12:%.*]] = load <4 x float>, ptr [[TMP37]], align 4
; CORTEXA55-NEXT:    [[TMP38:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_12]], <4 x float> [[WIDE_LOAD12_12]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP38]], ptr [[TMP37]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_12:%.*]] = add nuw nsw i64 [[INDEX]], 52
; CORTEXA55-NEXT:    [[TMP39:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_12]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x float>, ptr [[TMP39]], align 4
; CORTEXA55-NEXT:    [[TMP40:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_12]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_13:%.*]] = load <4 x float>, ptr [[TMP40]], align 4
; CORTEXA55-NEXT:    [[TMP41:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_13]], <4 x float> [[WIDE_LOAD12_13]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP41]], ptr [[TMP40]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_13:%.*]] = add nuw nsw i64 [[INDEX]], 56
; CORTEXA55-NEXT:    [[TMP42:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_13]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x float>, ptr [[TMP42]], align 4
; CORTEXA55-NEXT:    [[TMP43:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_13]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_14:%.*]] = load <4 x float>, ptr [[TMP43]], align 4
; CORTEXA55-NEXT:    [[TMP44:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_14]], <4 x float> [[WIDE_LOAD12_14]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP44]], ptr [[TMP43]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_14:%.*]] = add nuw nsw i64 [[INDEX]], 60
; CORTEXA55-NEXT:    [[TMP45:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_14]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x float>, ptr [[TMP45]], align 4
; CORTEXA55-NEXT:    [[TMP46:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_14]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_15:%.*]] = load <4 x float>, ptr [[TMP46]], align 4
; CORTEXA55-NEXT:    [[TMP47:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_15]], <4 x float> [[WIDE_LOAD12_15]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP47]], ptr [[TMP46]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_15]] = add nuw nsw i64 [[INDEX]], 64
; CORTEXA55-NEXT:    [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT_15]], 1024
; CORTEXA55-NEXT:    br i1 [[TMP48]], label %[[EXIT:.*]], label %[[VECTOR_BODY]]
; CORTEXA55:       [[EXIT]]:
; CORTEXA55-NEXT:    ret void
;
entry:
  %broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
  %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds nuw float, ptr %src, i64 %index
  %wide.load = load <4 x float>, ptr %0, align 4
  %1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
  %wide.load12 = load <4 x float>, ptr %1, align 4
  %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
  store <4 x float> %2, ptr %1, align 4
  %index.next = add nuw i64 %index, 4
  %3 = icmp eq i64 %index.next, 1024
  br i1 %3, label %exit, label %vector.body

exit:                                             ; preds = %vector.body
  ret void
}


define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
; APPLE-LABEL: define void @saxpy_tripcount1K_av1(
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
; APPLE-NEXT:  [[ENTRY:.*]]:
; APPLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; APPLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; APPLE-NEXT:    br label %[[VECTOR_BODY:.*]]
; APPLE:       [[VECTOR_BODY]]:
; APPLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; APPLE-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
; APPLE-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
; APPLE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
; APPLE-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; APPLE-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; APPLE-NEXT:    store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
; APPLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; APPLE-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; APPLE-NEXT:    br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; APPLE:       [[EXIT]]:
; APPLE-NEXT:    ret void
;
; CORTEXA55-LABEL: define void @saxpy_tripcount1K_av1(
; CORTEXA55-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
; CORTEXA55-NEXT:  [[ENTRY:.*]]:
; CORTEXA55-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; CORTEXA55-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; CORTEXA55-NEXT:    br label %[[VECTOR_BODY:.*]]
; CORTEXA55:       [[VECTOR_BODY]]:
; CORTEXA55-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CORTEXA55-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
; CORTEXA55-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
; CORTEXA55-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CORTEXA55-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CORTEXA55-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CORTEXA55-NEXT:    br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CORTEXA55:       [[EXIT]]:
; CORTEXA55-NEXT:    ret void
;
entry:
  %broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
  %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds nuw float, ptr %src, i64 %index
  %wide.load = load <4 x float>, ptr %0, align 4
  %1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
  %wide.load12 = load <4 x float>, ptr %1, align 4
  %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
  store <4 x float> %2, ptr %1, align 4
  %index.next = add nuw i64 %index, 4
  %3 = icmp eq i64 %index.next, 1024
  br i1 %3, label %exit, label %vector.body, !llvm.loop !0

exit:                                 ; preds = %vector.body
  ret void
}
!0 = !{!0, !1}
!1 = !{!"llvm.loop.isvectorized", i32 1}

; On Cortex-A55 we should runtime unroll the scalar epilogue loop, but not the
; vector loop.
define void @scalar_epilogue(ptr %p, i8 %splat.scalar, i64 %n) {
; APPLE-LABEL: define void @scalar_epilogue(
; APPLE-SAME: ptr [[P:%.*]], i8 [[SPLAT_SCALAR:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; APPLE-NEXT:  [[ENTRY:.*]]:
; APPLE-NEXT:    [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[N]], 32
; APPLE-NEXT:    br i1 [[MIN_ITERS_CHECK7]], label %[[SCALAR_REMAINDER_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
; APPLE:       [[VECTOR_PH]]:
; APPLE-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -32
; APPLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[SPLAT_SCALAR]], i64 0
; APPLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
; APPLE-NEXT:    br label %[[VECTOR_BODY:.*]]
; APPLE:       [[VECTOR_BODY]]:
; APPLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; APPLE-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INDEX]]
; APPLE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
; APPLE-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; APPLE-NEXT:    [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; APPLE-NEXT:    [[TMP2:%.*]] = add <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; APPLE-NEXT:    [[TMP3:%.*]] = add <16 x i8> [[WIDE_LOAD8]], [[BROADCAST_SPLAT]]
; APPLE-NEXT:    store <16 x i8> [[TMP2]], ptr [[TMP0]], align 1
; APPLE-NEXT:    store <16 x i8> [[TMP3]], ptr [[TMP1]], align 1
; APPLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; APPLE-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; APPLE-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; APPLE:       [[MIDDLE_BLOCK]]:
; APPLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; APPLE-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_REMAINDER_PREHEADER]]
; APPLE:       [[SCALAR_REMAINDER_PREHEADER]]:
; APPLE-NEXT:    [[IV_SCALAR_LOOP_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ]
; APPLE-NEXT:    br label %[[SCALAR_REMAINDER:.*]]
; APPLE:       [[SCALAR_REMAINDER]]:
; APPLE-NEXT:    [[I_06:%.*]] = phi i64 [ [[INC:%.*]], %[[SCALAR_REMAINDER]] ], [ [[IV_SCALAR_LOOP_PH]], %[[SCALAR_REMAINDER_PREHEADER]] ]
; APPLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[I_06]]
; APPLE-NEXT:    [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; APPLE-NEXT:    [[ADD:%.*]] = add i8 [[TMP8]], [[SPLAT_SCALAR]]
; APPLE-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
; APPLE-NEXT:    [[INC]] = add nuw i64 [[I_06]], 1
; APPLE-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
; APPLE-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_REMAINDER]], !llvm.loop [[LOOP5:![0-9]+]]
; APPLE:       [[EXIT_LOOPEXIT]]:
; APPLE-NEXT:    br label %[[EXIT]]
; APPLE:       [[EXIT]]:
; APPLE-NEXT:    ret void
;
; CORTEXA55-LABEL: define void @scalar_epilogue(
; CORTEXA55-SAME: ptr [[P:%.*]], i8 [[SPLAT_SCALAR:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CORTEXA55-NEXT:  [[ENTRY:.*]]:
; CORTEXA55-NEXT:    [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[N]], 32
; CORTEXA55-NEXT:    br i1 [[MIN_ITERS_CHECK7]], label %[[SCALAR_REMAINDER_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
; CORTEXA55:       [[VECTOR_PH]]:
; CORTEXA55-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], -32
; CORTEXA55-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[SPLAT_SCALAR]], i64 0
; CORTEXA55-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
; CORTEXA55-NEXT:    br label %[[VECTOR_BODY:.*]]
; CORTEXA55:       [[VECTOR_BODY]]:
; CORTEXA55-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CORTEXA55-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INDEX]]
; CORTEXA55-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
; CORTEXA55-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; CORTEXA55-NEXT:    [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CORTEXA55-NEXT:    [[TMP2:%.*]] = add <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; CORTEXA55-NEXT:    [[TMP3:%.*]] = add <16 x i8> [[WIDE_LOAD8]], [[BROADCAST_SPLAT]]
; CORTEXA55-NEXT:    store <16 x i8> [[TMP2]], ptr [[TMP0]], align 1
; CORTEXA55-NEXT:    store <16 x i8> [[TMP3]], ptr [[TMP1]], align 1
; CORTEXA55-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CORTEXA55-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CORTEXA55-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CORTEXA55:       [[MIDDLE_BLOCK]]:
; CORTEXA55-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CORTEXA55-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_REMAINDER_PREHEADER]]
; CORTEXA55:       [[SCALAR_REMAINDER_PREHEADER]]:
; CORTEXA55-NEXT:    [[I_06_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ]
; CORTEXA55-NEXT:    [[TMP8:%.*]] = sub i64 [[N]], [[I_06_PH]]
; CORTEXA55-NEXT:    [[TMP9:%.*]] = add i64 [[N]], -1
; CORTEXA55-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP9]], [[I_06_PH]]
; CORTEXA55-NEXT:    [[XTRAITER:%.*]] = and i64 [[TMP8]], 3
; CORTEXA55-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
; CORTEXA55-NEXT:    br i1 [[LCMP_MOD]], label %[[SCALAR_REMAINDER_PROL_PREHEADER:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT:.*]]
; CORTEXA55:       [[SCALAR_REMAINDER_PROL_PREHEADER]]:
; CORTEXA55-NEXT:    br label %[[SCALAR_REMAINDER_PROL:.*]]
; CORTEXA55:       [[SCALAR_REMAINDER_PROL]]:
; CORTEXA55-NEXT:    [[ARRAYIDX_PROL:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[I_06_PH]]
; CORTEXA55-NEXT:    [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX_PROL]], align 1
; CORTEXA55-NEXT:    [[ADD_PROL:%.*]] = add i8 [[TMP11]], [[SPLAT_SCALAR]]
; CORTEXA55-NEXT:    store i8 [[ADD_PROL]], ptr [[ARRAYIDX_PROL]], align 1
; CORTEXA55-NEXT:    [[INC_PROL:%.*]] = add nuw i64 [[I_06_PH]], 1
; CORTEXA55-NEXT:    [[PROL_ITER_CMP:%.*]] = icmp ne i64 1, [[XTRAITER]]
; CORTEXA55-NEXT:    br i1 [[PROL_ITER_CMP]], label %[[SCALAR_REMAINDER_PROL_1:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA:.*]]
; CORTEXA55:       [[SCALAR_REMAINDER_PROL_1]]:
; CORTEXA55-NEXT:    [[ARRAYIDX_PROL_1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_PROL]]
; CORTEXA55-NEXT:    [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX_PROL_1]], align 1
; CORTEXA55-NEXT:    [[ADD_PROL_1:%.*]] = add i8 [[TMP12]], [[SPLAT_SCALAR]]
; CORTEXA55-NEXT:    store i8 [[ADD_PROL_1]], ptr [[ARRAYIDX_PROL_1]], align 1
; CORTEXA55-NEXT:    [[INC_PROL_1:%.*]] = add nuw i64 [[I_06_PH]], 2
; CORTEXA55-NEXT:    [[PROL_ITER_CMP_1:%.*]] = icmp ne i64 2, [[XTRAITER]]
; CORTEXA55-NEXT:    br i1 [[PROL_ITER_CMP_1]], label %[[SCALAR_REMAINDER_PROL_2:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]
; CORTEXA55:       [[SCALAR_REMAINDER_PROL_2]]:
; CORTEXA55-NEXT:    [[ARRAYIDX_PROL_2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_PROL_1]]
; CORTEXA55-NEXT:    [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX_PROL_2]], align 1
; CORTEXA55-NEXT:    [[ADD_PROL_2:%.*]] = add i8 [[TMP13]], [[SPLAT_SCALAR]]
; CORTEXA55-NEXT:    store i8 [[ADD_PROL_2]], ptr [[ARRAYIDX_PROL_2]], align 1
; CORTEXA55-NEXT:    [[INC_PROL_2:%.*]] = add nuw i64 [[I_06_PH]], 3
; CORTEXA55-NEXT:    br label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]
; CORTEXA55:       [[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]:
; CORTEXA55-NEXT:    [[IV_SCALAR_LOOP_UNR_PH:%.*]] = phi i64 [ [[INC_PROL]], %[[SCALAR_REMAINDER_PROL]] ], [ [[INC_PROL_1]], %[[SCALAR_REMAINDER_PROL_1]] ], [ [[INC_PROL_2]], %[[SCALAR_REMAINDER_PROL_2]] ]
; CORTEXA55-NEXT:    br label %[[SCALAR_REMAINDER_PROL_LOOPEXIT]]
; CORTEXA55:       [[SCALAR_REMAINDER_PROL_LOOPEXIT]]:
; CORTEXA55-NEXT:    [[IV_SCALAR_LOOP_UNR:%.*]] = phi i64 [ [[I_06_PH]], %[[SCALAR_REMAINDER_PREHEADER]] ], [ [[IV_SCALAR_LOOP_UNR_PH]], %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]] ]
; CORTEXA55-NEXT:    [[TMP14:%.*]] = icmp ult i64 [[TMP10]], 3
; CORTEXA55-NEXT:    br i1 [[TMP14]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_REMAINDER_PREHEADER_NEW:.*]]
; CORTEXA55:       [[SCALAR_REMAINDER_PREHEADER_NEW]]:
; CORTEXA55-NEXT:    br label %[[SCALAR_REMAINDER:.*]]
; CORTEXA55:       [[SCALAR_REMAINDER]]:
; CORTEXA55-NEXT:    [[I_06:%.*]] = phi i64 [ [[IV_SCALAR_LOOP_UNR]], %[[SCALAR_REMAINDER_PREHEADER_NEW]] ], [ [[INC_3:%.*]], %[[SCALAR_REMAINDER]] ]
; CORTEXA55-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[I_06]]
; CORTEXA55-NEXT:    [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CORTEXA55-NEXT:    [[ADD:%.*]] = add i8 [[TMP15]], [[SPLAT_SCALAR]]
; CORTEXA55-NEXT:    store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
; CORTEXA55-NEXT:    [[INC:%.*]] = add nuw i64 [[I_06]], 1
; CORTEXA55-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC]]
; CORTEXA55-NEXT:    [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1
; CORTEXA55-NEXT:    [[ADD_1:%.*]] = add i8 [[TMP16]], [[SPLAT_SCALAR]]
; CORTEXA55-NEXT:    store i8 [[ADD_1]], ptr [[ARRAYIDX_1]], align 1
; CORTEXA55-NEXT:    [[INC_1:%.*]] = add nuw i64 [[I_06]], 2
; CORTEXA55-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_1]]
; CORTEXA55-NEXT:    [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1
; CORTEXA55-NEXT:    [[ADD_2:%.*]] = add i8 [[TMP17]], [[SPLAT_SCALAR]]
; CORTEXA55-NEXT:    store i8 [[ADD_2]], ptr [[ARRAYIDX_2]], align 1
; CORTEXA55-NEXT:    [[INC_2:%.*]] = add nuw i64 [[I_06]], 3
; CORTEXA55-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_2]]
; CORTEXA55-NEXT:    [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1
; CORTEXA55-NEXT:    [[ADD_3:%.*]] = add i8 [[TMP18]], [[SPLAT_SCALAR]]
; CORTEXA55-NEXT:    store i8 [[ADD_3]], ptr [[ARRAYIDX_3]], align 1
; CORTEXA55-NEXT:    [[INC_3]] = add nuw i64 [[I_06]], 4
; CORTEXA55-NEXT:    [[EXITCOND_NOT_3:%.*]] = icmp eq i64 [[INC_3]], [[N]]
; CORTEXA55-NEXT:    br i1 [[EXITCOND_NOT_3]], label %[[EXIT_LOOPEXIT_UNR_LCSSA:.*]], label %[[SCALAR_REMAINDER]], !llvm.loop [[LOOP3:![0-9]+]]
; CORTEXA55:       [[EXIT_LOOPEXIT_UNR_LCSSA]]:
; CORTEXA55-NEXT:    br label %[[EXIT_LOOPEXIT]]
; CORTEXA55:       [[EXIT_LOOPEXIT]]:
; CORTEXA55-NEXT:    br label %[[EXIT]]
; CORTEXA55:       [[EXIT]]:
; CORTEXA55-NEXT:    ret void
;
entry:
  %min.iters.check = icmp ult i64 %n, 32
  br i1 %min.iters.check, label %scalar.remainder, label %vector.ph

vector.ph:
  %n.vec = and i64 %n, -32
  %broadcast.splatinsert = insertelement <16 x i8> poison, i8 %splat.scalar, i64 0
  %broadcast.splat = shufflevector <16 x i8> %broadcast.splatinsert, <16 x i8> poison, <16 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %iv = phi i64 [ 0, %vector.ph ], [ %iv.next, %vector.body ]
  %gep.p.iv = getelementptr inbounds nuw i8, ptr %p, i64 %iv
  %gep.p.iv.16 = getelementptr inbounds nuw i8, ptr %gep.p.iv, i64 16
  %wide.load = load <16 x i8>, ptr %gep.p.iv, align 1
  %wide.load.2 = load <16 x i8>, ptr %gep.p.iv.16, align 1
  %add.broadcast = add <16 x i8> %wide.load, %broadcast.splat
  %add.broadcast.2 = add <16 x i8> %wide.load.2, %broadcast.splat
  store <16 x i8> %add.broadcast, ptr %gep.p.iv, align 1
  store <16 x i8> %add.broadcast.2, ptr %gep.p.iv.16, align 1
  %iv.next = add nuw i64 %iv, 32
  %exit.cond = icmp eq i64 %iv.next, %n.vec
  br i1 %exit.cond, label %middle.block, label %vector.body, !llvm.loop !2

middle.block:
  %cmp.n = icmp eq i64 %n, %n.vec
  br i1 %cmp.n, label %exit, label %scalar.remainder

scalar.remainder:
  %iv.scalar.loop = phi i64 [ %inc, %scalar.remainder ], [ %n.vec, %middle.block ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds nuw i8, ptr %p, i64 %iv.scalar.loop
  %scalar.load = load i8, ptr %arrayidx, align 1
  %add = add i8 %scalar.load, %splat.scalar
  store i8 %add, ptr %arrayidx, align 1
  %inc = add nuw i64 %iv.scalar.loop, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %exit, label %scalar.remainder, !llvm.loop !3

exit:
  ret void
}

!2 = distinct !{!2, !1}
!3 = distinct !{!3, !1}

;.
; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
; APPLE: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]}
; APPLE: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
; APPLE: [[LOOP4]] = distinct !{[[LOOP4]], [[META3]]}
; APPLE: [[LOOP5]] = distinct !{[[LOOP5]], [[META3]]}
;.
; CORTEXA55: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
; CORTEXA55: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CORTEXA55: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
; CORTEXA55: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
;.