#147420 changed the unrolling preferences to permit unrolling of non-auto-vectorized loops by checking for the isvectorized attribute. However, when a loop is vectorized this attribute is put on both the vector loop and the scalar epilogue, so that change prevented the scalar epilogue from being unrolled. Restore the previous behaviour of unrolling the scalar epilogue by checking for both the isvectorized attribute and vector instructions in the loop.
691 lines
48 KiB
LLVM
691 lines
48 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
|
|
; RUN: opt -p loop-unroll -mtriple=arm64-apple-macosx -mcpu=apple-m1 -S %s | FileCheck --check-prefix=APPLE %s
|
|
; RUN: opt -p loop-unroll -mtriple=aarch64-unknown-linux -mcpu=cortex-a55 -S %s | FileCheck %s -check-prefix=CORTEXA55
|
|
|
|
; Reversed copy with a run-time trip count: each iteration loads a <4 x float>
; from %src at index (%len - %iv) and stores it to %dst at index %iv.
;
; Expected output under the APPLE check prefix: the body is replicated 8 times
; (remainder count is %len & 7) and leftover iterations run in an epilogue loop.
; Expected output under the CORTEXA55 check prefix: the body is replicated 4
; times (remainder count is %len & 3) and the epilogue is emitted as up to
; three straight-line copies of the body.
define void @reverse(ptr %dst, ptr %src, i64 %len) {
; APPLE-LABEL: define void @reverse(
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
; APPLE-NEXT:  [[ENTRY:.*]]:
; APPLE-NEXT:    [[TMP0:%.*]] = add i64 [[LEN]], -1
; APPLE-NEXT:    [[XTRAITER:%.*]] = and i64 [[LEN]], 7
; APPLE-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7
; APPLE-NEXT:    br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
; APPLE:       [[ENTRY_NEW]]:
; APPLE-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
; APPLE-NEXT:    br label %[[FOR_BODY:.*]]
; APPLE:       [[FOR_BODY]]:
; APPLE-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
; APPLE-NEXT:    [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
; APPLE-NEXT:    [[TMP2:%.*]] = sub nsw i64 [[LEN]], [[IV]]
; APPLE-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP2]]
; APPLE-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
; APPLE-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
; APPLE-NEXT:    store <4 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 16
; APPLE-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
; APPLE-NEXT:    [[TMP4:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT]]
; APPLE-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP4]]
; APPLE-NEXT:    [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
; APPLE-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT]]
; APPLE-NEXT:    store <4 x float> [[TMP5]], ptr [[ARRAYIDX2_1]], align 16
; APPLE-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
; APPLE-NEXT:    [[TMP6:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_1]]
; APPLE-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP6]]
; APPLE-NEXT:    [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
; APPLE-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_1]]
; APPLE-NEXT:    store <4 x float> [[TMP7]], ptr [[ARRAYIDX2_2]], align 16
; APPLE-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
; APPLE-NEXT:    [[TMP8:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_2]]
; APPLE-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP8]]
; APPLE-NEXT:    [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
; APPLE-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_2]]
; APPLE-NEXT:    store <4 x float> [[TMP9]], ptr [[ARRAYIDX2_3]], align 16
; APPLE-NEXT:    [[IV_NEXT_3:%.*]] = add nuw nsw i64 [[IV]], 4
; APPLE-NEXT:    [[TMP10:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_3]]
; APPLE-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP10]]
; APPLE-NEXT:    [[TMP11:%.*]] = load <4 x float>, ptr [[ARRAYIDX_4]], align 16
; APPLE-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_3]]
; APPLE-NEXT:    store <4 x float> [[TMP11]], ptr [[ARRAYIDX2_4]], align 16
; APPLE-NEXT:    [[IV_NEXT_4:%.*]] = add nuw nsw i64 [[IV]], 5
; APPLE-NEXT:    [[TMP12:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_4]]
; APPLE-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP12]]
; APPLE-NEXT:    [[TMP13:%.*]] = load <4 x float>, ptr [[ARRAYIDX_5]], align 16
; APPLE-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_4]]
; APPLE-NEXT:    store <4 x float> [[TMP13]], ptr [[ARRAYIDX2_5]], align 16
; APPLE-NEXT:    [[IV_NEXT_5:%.*]] = add nuw nsw i64 [[IV]], 6
; APPLE-NEXT:    [[TMP14:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_5]]
; APPLE-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP14]]
; APPLE-NEXT:    [[TMP15:%.*]] = load <4 x float>, ptr [[ARRAYIDX_6]], align 16
; APPLE-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_5]]
; APPLE-NEXT:    store <4 x float> [[TMP15]], ptr [[ARRAYIDX2_6]], align 16
; APPLE-NEXT:    [[IV_NEXT_6:%.*]] = add nuw nsw i64 [[IV]], 7
; APPLE-NEXT:    [[TMP16:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_6]]
; APPLE-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP16]]
; APPLE-NEXT:    [[TMP17:%.*]] = load <4 x float>, ptr [[ARRAYIDX_7]], align 16
; APPLE-NEXT:    [[ARRAYIDX2_7:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_6]]
; APPLE-NEXT:    store <4 x float> [[TMP17]], ptr [[ARRAYIDX2_7]], align 16
; APPLE-NEXT:    [[IV_NEXT_7]] = add nuw nsw i64 [[IV]], 8
; APPLE-NEXT:    [[NITER_NEXT_7]] = add i64 [[NITER]], 8
; APPLE-NEXT:    [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
; APPLE-NEXT:    br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[FOR_BODY]]
; APPLE:       [[EXIT_UNR_LCSSA_LOOPEXIT]]:
; APPLE-NEXT:    [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_7]], %[[FOR_BODY]] ]
; APPLE-NEXT:    br label %[[EXIT_UNR_LCSSA]]
; APPLE:       [[EXIT_UNR_LCSSA]]:
; APPLE-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
; APPLE-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
; APPLE-NEXT:    br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
; APPLE:       [[FOR_BODY_EPIL_PREHEADER]]:
; APPLE-NEXT:    br label %[[FOR_BODY_EPIL:.*]]
; APPLE:       [[FOR_BODY_EPIL]]:
; APPLE-NEXT:    [[IV_EPIL:%.*]] = phi i64 [ [[IV_UNR]], %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ]
; APPLE-NEXT:    [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ]
; APPLE-NEXT:    [[TMP18:%.*]] = sub nsw i64 [[LEN]], [[IV_EPIL]]
; APPLE-NEXT:    [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP18]]
; APPLE-NEXT:    [[TMP19:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
; APPLE-NEXT:    [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_EPIL]]
; APPLE-NEXT:    store <4 x float> [[TMP19]], ptr [[ARRAYIDX2_EPIL]], align 16
; APPLE-NEXT:    [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
; APPLE-NEXT:    [[EXITCOND_NOT_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[LEN]]
; APPLE-NEXT:    [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
; APPLE-NEXT:    [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
; APPLE-NEXT:    br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
; APPLE:       [[EXIT_EPILOG_LCSSA]]:
; APPLE-NEXT:    br label %[[EXIT]]
; APPLE:       [[EXIT]]:
; APPLE-NEXT:    ret void
;
; CORTEXA55-LABEL: define void @reverse(
; CORTEXA55-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
; CORTEXA55-NEXT:  [[ENTRY:.*]]:
; CORTEXA55-NEXT:    [[TMP0:%.*]] = add i64 [[LEN]], -1
; CORTEXA55-NEXT:    [[XTRAITER:%.*]] = and i64 [[LEN]], 3
; CORTEXA55-NEXT:    [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 3
; CORTEXA55-NEXT:    br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
; CORTEXA55:       [[ENTRY_NEW]]:
; CORTEXA55-NEXT:    [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
; CORTEXA55-NEXT:    br label %[[FOR_BODY:.*]]
; CORTEXA55:       [[FOR_BODY]]:
; CORTEXA55-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_3:%.*]], %[[FOR_BODY]] ]
; CORTEXA55-NEXT:    [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_3:%.*]], %[[FOR_BODY]] ]
; CORTEXA55-NEXT:    [[TMP2:%.*]] = sub nsw i64 [[LEN]], [[IV]]
; CORTEXA55-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP2]]
; CORTEXA55-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
; CORTEXA55-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
; CORTEXA55-NEXT:    store <4 x float> [[TMP3]], ptr [[ARRAYIDX2]], align 16
; CORTEXA55-NEXT:    [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
; CORTEXA55-NEXT:    [[TMP4:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT]]
; CORTEXA55-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP4]]
; CORTEXA55-NEXT:    [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
; CORTEXA55-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT]]
; CORTEXA55-NEXT:    store <4 x float> [[TMP5]], ptr [[ARRAYIDX2_1]], align 16
; CORTEXA55-NEXT:    [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
; CORTEXA55-NEXT:    [[TMP6:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_1]]
; CORTEXA55-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP6]]
; CORTEXA55-NEXT:    [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
; CORTEXA55-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_1]]
; CORTEXA55-NEXT:    store <4 x float> [[TMP7]], ptr [[ARRAYIDX2_2]], align 16
; CORTEXA55-NEXT:    [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
; CORTEXA55-NEXT:    [[TMP8:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_2]]
; CORTEXA55-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP8]]
; CORTEXA55-NEXT:    [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
; CORTEXA55-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_2]]
; CORTEXA55-NEXT:    store <4 x float> [[TMP9]], ptr [[ARRAYIDX2_3]], align 16
; CORTEXA55-NEXT:    [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
; CORTEXA55-NEXT:    [[NITER_NEXT_3]] = add i64 [[NITER]], 4
; CORTEXA55-NEXT:    [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NEXT_3]], [[UNROLL_ITER]]
; CORTEXA55-NEXT:    br i1 [[NITER_NCMP_3]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[FOR_BODY]]
; CORTEXA55:       [[EXIT_UNR_LCSSA_LOOPEXIT]]:
; CORTEXA55-NEXT:    [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_3]], %[[FOR_BODY]] ]
; CORTEXA55-NEXT:    br label %[[EXIT_UNR_LCSSA]]
; CORTEXA55:       [[EXIT_UNR_LCSSA]]:
; CORTEXA55-NEXT:    [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
; CORTEXA55-NEXT:    [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
; CORTEXA55-NEXT:    br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
; CORTEXA55:       [[FOR_BODY_EPIL_PREHEADER]]:
; CORTEXA55-NEXT:    br label %[[FOR_BODY_EPIL:.*]]
; CORTEXA55:       [[FOR_BODY_EPIL]]:
; CORTEXA55-NEXT:    [[TMP10:%.*]] = sub nsw i64 [[LEN]], [[IV_UNR]]
; CORTEXA55-NEXT:    [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP10]]
; CORTEXA55-NEXT:    [[TMP11:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
; CORTEXA55-NEXT:    [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_UNR]]
; CORTEXA55-NEXT:    store <4 x float> [[TMP11]], ptr [[ARRAYIDX2_EPIL]], align 16
; CORTEXA55-NEXT:    [[IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[IV_UNR]], 1
; CORTEXA55-NEXT:    [[EPIL_ITER_CMP:%.*]] = icmp ne i64 1, [[XTRAITER]]
; CORTEXA55-NEXT:    br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL_1:.*]], label %[[EXIT_EPILOG_LCSSA:.*]]
; CORTEXA55:       [[FOR_BODY_EPIL_1]]:
; CORTEXA55-NEXT:    [[TMP12:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL]]
; CORTEXA55-NEXT:    [[ARRAYIDX_EPIL_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP12]]
; CORTEXA55-NEXT:    [[TMP13:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_1]], align 16
; CORTEXA55-NEXT:    [[ARRAYIDX2_EPIL_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL]]
; CORTEXA55-NEXT:    store <4 x float> [[TMP13]], ptr [[ARRAYIDX2_EPIL_1]], align 16
; CORTEXA55-NEXT:    [[IV_NEXT_EPIL_1:%.*]] = add nuw nsw i64 [[IV_UNR]], 2
; CORTEXA55-NEXT:    [[EPIL_ITER_CMP_1:%.*]] = icmp ne i64 2, [[XTRAITER]]
; CORTEXA55-NEXT:    br i1 [[EPIL_ITER_CMP_1]], label %[[FOR_BODY_EPIL_2:.*]], label %[[EXIT_EPILOG_LCSSA]]
; CORTEXA55:       [[FOR_BODY_EPIL_2]]:
; CORTEXA55-NEXT:    [[TMP14:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL_1]]
; CORTEXA55-NEXT:    [[ARRAYIDX_EPIL_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP14]]
; CORTEXA55-NEXT:    [[TMP15:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_2]], align 16
; CORTEXA55-NEXT:    [[ARRAYIDX2_EPIL_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL_1]]
; CORTEXA55-NEXT:    store <4 x float> [[TMP15]], ptr [[ARRAYIDX2_EPIL_2]], align 16
; CORTEXA55-NEXT:    br label %[[EXIT_EPILOG_LCSSA]]
; CORTEXA55:       [[EXIT_EPILOG_LCSSA]]:
; CORTEXA55-NEXT:    br label %[[EXIT]]
; CORTEXA55:       [[EXIT]]:
; CORTEXA55-NEXT:    ret void
;
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %1 = sub nsw i64 %len, %iv
  %arrayidx = getelementptr inbounds <4 x float>, ptr %src, i64 %1
  %2 = load <4 x float>, ptr %arrayidx, align 16
  %arrayidx2 = getelementptr inbounds nuw <4 x float>, ptr %dst, i64 %iv
  store <4 x float> %2, ptr %arrayidx2, align 16
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %len
  br i1 %exitcond.not, label %exit, label %for.body

exit:                                             ; preds = %for.body
  ret void
}
|
|
|
|
|
|
; Vector saxpy-style loop with a constant trip count: %index advances by 4 and
; the loop exits when it reaches 8, i.e. two iterations of
;   dst[index..index+3] = fmuladd(splat(%a), src[index..index+3], dst[index..index+3]).
;
; Both check prefixes expect the loop to be fully unrolled: two straight-line
; copies of the body followed directly by the return, with no back edge.
define void @saxpy_tripcount8_full_unroll(ptr %dst, ptr %src, float %a) {
; APPLE-LABEL: define void @saxpy_tripcount8_full_unroll(
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
; APPLE-NEXT:  [[ENTRY:.*:]]
; APPLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; APPLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; APPLE-NEXT:    br label %[[VECTOR_BODY:.*]]
; APPLE:       [[VECTOR_BODY]]:
; APPLE-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[SRC]], align 4
; APPLE-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[DST]], align 4
; APPLE-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; APPLE-NEXT:    store <4 x float> [[TMP0]], ptr [[DST]], align 4
; APPLE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 4
; APPLE-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; APPLE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 4
; APPLE-NEXT:    [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; APPLE-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
; APPLE-NEXT:    store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
; APPLE-NEXT:    ret void
;
; CORTEXA55-LABEL: define void @saxpy_tripcount8_full_unroll(
; CORTEXA55-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
; CORTEXA55-NEXT:  [[ENTRY:.*:]]
; CORTEXA55-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; CORTEXA55-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; CORTEXA55-NEXT:    br label %[[VECTOR_BODY:.*]]
; CORTEXA55:       [[VECTOR_BODY]]:
; CORTEXA55-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[SRC]], align 4
; CORTEXA55-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[DST]], align 4
; CORTEXA55-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP0]], ptr [[DST]], align 4
; CORTEXA55-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 4
; CORTEXA55-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CORTEXA55-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 4
; CORTEXA55-NEXT:    [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; CORTEXA55-NEXT:    [[TMP3:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
; CORTEXA55-NEXT:    ret void
;
entry:
  %broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
  %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds nuw float, ptr %src, i64 %index
  %wide.load = load <4 x float>, ptr %0, align 4
  %1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
  %wide.load12 = load <4 x float>, ptr %1, align 4
  %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
  store <4 x float> %2, ptr %1, align 4
  %index.next = add nuw i64 %index, 4
  %3 = icmp eq i64 %index.next, 8
  br i1 %3, label %exit, label %vector.body

exit:                                             ; preds = %vector.body
  ret void
}
|
|
|
|
|
|
; Vector saxpy-style loop with a constant bound: %index advances by 4 and the
; loop exits when it reaches 1024. The loop carries no loop metadata.
;
; Under the APPLE check prefix the loop is expected to stay as written (one
; body per iteration, index step 4). Under the CORTEXA55 check prefix it is
; expected to be unrolled so each iteration holds 16 copies of the body
; (index step 64).
define void @saxpy_tripcount1K_av0(ptr %dst, ptr %src, float %a) {
; APPLE-LABEL: define void @saxpy_tripcount1K_av0(
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
; APPLE-NEXT:  [[ENTRY:.*]]:
; APPLE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; APPLE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; APPLE-NEXT:    br label %[[VECTOR_BODY:.*]]
; APPLE:       [[VECTOR_BODY]]:
; APPLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; APPLE-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
; APPLE-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
; APPLE-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
; APPLE-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; APPLE-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; APPLE-NEXT:    store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
; APPLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; APPLE-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; APPLE-NEXT:    br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]]
; APPLE:       [[EXIT]]:
; APPLE-NEXT:    ret void
;
; CORTEXA55-LABEL: define void @saxpy_tripcount1K_av0(
; CORTEXA55-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
; CORTEXA55-NEXT:  [[ENTRY:.*]]:
; CORTEXA55-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; CORTEXA55-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; CORTEXA55-NEXT:    br label %[[VECTOR_BODY:.*]]
; CORTEXA55:       [[VECTOR_BODY]]:
; CORTEXA55-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT_15:%.*]], %[[VECTOR_BODY]] ]
; CORTEXA55-NEXT:    [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
; CORTEXA55-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
; CORTEXA55-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CORTEXA55-NEXT:    [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT:%.*]] = add nuw nsw i64 [[INDEX]], 4
; CORTEXA55-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
; CORTEXA55-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; CORTEXA55-NEXT:    [[TMP5:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP5]], ptr [[TMP4]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_1:%.*]] = add nuw nsw i64 [[INDEX]], 8
; CORTEXA55-NEXT:    [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_1]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_2:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
; CORTEXA55-NEXT:    [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_1]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_2:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
; CORTEXA55-NEXT:    [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_2]], <4 x float> [[WIDE_LOAD12_2]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP8]], ptr [[TMP7]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_2:%.*]] = add nuw nsw i64 [[INDEX]], 12
; CORTEXA55-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_2]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_3:%.*]] = load <4 x float>, ptr [[TMP9]], align 4
; CORTEXA55-NEXT:    [[TMP10:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_2]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_3:%.*]] = load <4 x float>, ptr [[TMP10]], align 4
; CORTEXA55-NEXT:    [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_3]], <4 x float> [[WIDE_LOAD12_3]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP11]], ptr [[TMP10]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_3:%.*]] = add nuw nsw i64 [[INDEX]], 16
; CORTEXA55-NEXT:    [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_3]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_4:%.*]] = load <4 x float>, ptr [[TMP12]], align 4
; CORTEXA55-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_3]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_4:%.*]] = load <4 x float>, ptr [[TMP13]], align 4
; CORTEXA55-NEXT:    [[TMP14:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_4]], <4 x float> [[WIDE_LOAD12_4]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP14]], ptr [[TMP13]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_4:%.*]] = add nuw nsw i64 [[INDEX]], 20
; CORTEXA55-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_4]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_5:%.*]] = load <4 x float>, ptr [[TMP15]], align 4
; CORTEXA55-NEXT:    [[TMP16:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_4]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_5:%.*]] = load <4 x float>, ptr [[TMP16]], align 4
; CORTEXA55-NEXT:    [[TMP17:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_5]], <4 x float> [[WIDE_LOAD12_5]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP17]], ptr [[TMP16]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_5:%.*]] = add nuw nsw i64 [[INDEX]], 24
; CORTEXA55-NEXT:    [[TMP18:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_5]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_6:%.*]] = load <4 x float>, ptr [[TMP18]], align 4
; CORTEXA55-NEXT:    [[TMP19:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_5]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_6:%.*]] = load <4 x float>, ptr [[TMP19]], align 4
; CORTEXA55-NEXT:    [[TMP20:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_6]], <4 x float> [[WIDE_LOAD12_6]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP20]], ptr [[TMP19]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_6:%.*]] = add nuw nsw i64 [[INDEX]], 28
; CORTEXA55-NEXT:    [[TMP21:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_6]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_7:%.*]] = load <4 x float>, ptr [[TMP21]], align 4
; CORTEXA55-NEXT:    [[TMP22:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_6]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_7:%.*]] = load <4 x float>, ptr [[TMP22]], align 4
; CORTEXA55-NEXT:    [[TMP23:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_7]], <4 x float> [[WIDE_LOAD12_7]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP23]], ptr [[TMP22]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_7:%.*]] = add nuw nsw i64 [[INDEX]], 32
; CORTEXA55-NEXT:    [[TMP24:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_7]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_8:%.*]] = load <4 x float>, ptr [[TMP24]], align 4
; CORTEXA55-NEXT:    [[TMP25:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_7]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_8:%.*]] = load <4 x float>, ptr [[TMP25]], align 4
; CORTEXA55-NEXT:    [[TMP26:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_8]], <4 x float> [[WIDE_LOAD12_8]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP26]], ptr [[TMP25]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_8:%.*]] = add nuw nsw i64 [[INDEX]], 36
; CORTEXA55-NEXT:    [[TMP27:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_8]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_9:%.*]] = load <4 x float>, ptr [[TMP27]], align 4
; CORTEXA55-NEXT:    [[TMP28:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_8]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_9:%.*]] = load <4 x float>, ptr [[TMP28]], align 4
; CORTEXA55-NEXT:    [[TMP29:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_9]], <4 x float> [[WIDE_LOAD12_9]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP29]], ptr [[TMP28]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_9:%.*]] = add nuw nsw i64 [[INDEX]], 40
; CORTEXA55-NEXT:    [[TMP30:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_9]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_10:%.*]] = load <4 x float>, ptr [[TMP30]], align 4
; CORTEXA55-NEXT:    [[TMP31:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_9]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_10:%.*]] = load <4 x float>, ptr [[TMP31]], align 4
; CORTEXA55-NEXT:    [[TMP32:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_10]], <4 x float> [[WIDE_LOAD12_10]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP32]], ptr [[TMP31]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_10:%.*]] = add nuw nsw i64 [[INDEX]], 44
; CORTEXA55-NEXT:    [[TMP33:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_10]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_11:%.*]] = load <4 x float>, ptr [[TMP33]], align 4
; CORTEXA55-NEXT:    [[TMP34:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_10]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_11:%.*]] = load <4 x float>, ptr [[TMP34]], align 4
; CORTEXA55-NEXT:    [[TMP35:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_11]], <4 x float> [[WIDE_LOAD12_11]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP35]], ptr [[TMP34]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_11:%.*]] = add nuw nsw i64 [[INDEX]], 48
; CORTEXA55-NEXT:    [[TMP36:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_11]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_12:%.*]] = load <4 x float>, ptr [[TMP36]], align 4
; CORTEXA55-NEXT:    [[TMP37:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_11]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_12:%.*]] = load <4 x float>, ptr [[TMP37]], align 4
; CORTEXA55-NEXT:    [[TMP38:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_12]], <4 x float> [[WIDE_LOAD12_12]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP38]], ptr [[TMP37]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_12:%.*]] = add nuw nsw i64 [[INDEX]], 52
; CORTEXA55-NEXT:    [[TMP39:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_12]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_13:%.*]] = load <4 x float>, ptr [[TMP39]], align 4
; CORTEXA55-NEXT:    [[TMP40:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_12]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_13:%.*]] = load <4 x float>, ptr [[TMP40]], align 4
; CORTEXA55-NEXT:    [[TMP41:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_13]], <4 x float> [[WIDE_LOAD12_13]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP41]], ptr [[TMP40]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_13:%.*]] = add nuw nsw i64 [[INDEX]], 56
; CORTEXA55-NEXT:    [[TMP42:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_13]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_14:%.*]] = load <4 x float>, ptr [[TMP42]], align 4
; CORTEXA55-NEXT:    [[TMP43:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_13]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_14:%.*]] = load <4 x float>, ptr [[TMP43]], align 4
; CORTEXA55-NEXT:    [[TMP44:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_14]], <4 x float> [[WIDE_LOAD12_14]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP44]], ptr [[TMP43]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_14:%.*]] = add nuw nsw i64 [[INDEX]], 60
; CORTEXA55-NEXT:    [[TMP45:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_14]]
; CORTEXA55-NEXT:    [[WIDE_LOAD_15:%.*]] = load <4 x float>, ptr [[TMP45]], align 4
; CORTEXA55-NEXT:    [[TMP46:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_14]]
; CORTEXA55-NEXT:    [[WIDE_LOAD12_15:%.*]] = load <4 x float>, ptr [[TMP46]], align 4
; CORTEXA55-NEXT:    [[TMP47:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_15]], <4 x float> [[WIDE_LOAD12_15]])
; CORTEXA55-NEXT:    store <4 x float> [[TMP47]], ptr [[TMP46]], align 4
; CORTEXA55-NEXT:    [[INDEX_NEXT_15]] = add nuw nsw i64 [[INDEX]], 64
; CORTEXA55-NEXT:    [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT_15]], 1024
; CORTEXA55-NEXT:    br i1 [[TMP48]], label %[[EXIT:.*]], label %[[VECTOR_BODY]]
; CORTEXA55:       [[EXIT]]:
; CORTEXA55-NEXT:    ret void
;
entry:
  %broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
  %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds nuw float, ptr %src, i64 %index
  %wide.load = load <4 x float>, ptr %0, align 4
  %1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
  %wide.load12 = load <4 x float>, ptr %1, align 4
  %2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
  store <4 x float> %2, ptr %1, align 4
  %index.next = add nuw i64 %index, 4
  %3 = icmp eq i64 %index.next, 1024
  br i1 %3, label %exit, label %vector.body

exit:                                             ; preds = %vector.body
  ret void
}
|
|
|
|
|
|
; Saxpy-style vector loop (dst[i..i+3] = fmuladd(a, src[i..i+3], dst[i..i+3]))
; with a constant exit bound of 1024 and a step of 4. Its backedge branch
; carries !llvm.loop !0, which includes llvm.loop.isvectorized. The assertions
; for both the apple-m1 and the cortex-a55 invocation expect a single copy of
; the loop body that still steps by 4, i.e. the loop is left un-unrolled.
define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
|
|
; APPLE-LABEL: define void @saxpy_tripcount1K_av1(
|
|
; APPLE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
|
|
; APPLE-NEXT: [[ENTRY:.*]]:
|
|
; APPLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
|
|
; APPLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
|
|
; APPLE-NEXT: br label %[[VECTOR_BODY:.*]]
|
|
; APPLE: [[VECTOR_BODY]]:
|
|
; APPLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
|
|
; APPLE-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
|
|
; APPLE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
|
|
; APPLE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
|
|
; APPLE-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
|
|
; APPLE-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
|
|
; APPLE-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
|
|
; APPLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
|
|
; APPLE-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
|
|
; APPLE-NEXT: br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
|
|
; APPLE: [[EXIT]]:
|
|
; APPLE-NEXT: ret void
|
|
;
|
|
; CORTEXA55-LABEL: define void @saxpy_tripcount1K_av1(
|
|
; CORTEXA55-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
|
|
; CORTEXA55-NEXT: [[ENTRY:.*]]:
|
|
; CORTEXA55-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
|
|
; CORTEXA55-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
|
|
; CORTEXA55-NEXT: br label %[[VECTOR_BODY:.*]]
|
|
; CORTEXA55: [[VECTOR_BODY]]:
|
|
; CORTEXA55-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
|
|
; CORTEXA55-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
|
|
; CORTEXA55-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
|
|
; CORTEXA55-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
|
|
; CORTEXA55-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
|
|
; CORTEXA55-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
|
|
; CORTEXA55-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
|
|
; CORTEXA55-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
|
|
; CORTEXA55-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
|
|
; CORTEXA55-NEXT: br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
|
|
; CORTEXA55: [[EXIT]]:
|
|
; CORTEXA55-NEXT: ret void
|
|
;
|
|
entry:
|
|
%broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
|
|
%broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
|
|
br label %vector.body
|
|
|
|
vector.body: ; preds = %vector.body, %entry
|
|
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
|
|
%0 = getelementptr inbounds nuw float, ptr %src, i64 %index
|
|
%wide.load = load <4 x float>, ptr %0, align 4
|
|
%1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
|
|
%wide.load12 = load <4 x float>, ptr %1, align 4
|
|
%2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
|
|
store <4 x float> %2, ptr %1, align 4
|
|
%index.next = add nuw i64 %index, 4
|
|
%3 = icmp eq i64 %index.next, 1024
|
|
; The backedge carries loop metadata !0, whose operand !1 is the
; llvm.loop.isvectorized attribute.
br i1 %3, label %exit, label %vector.body, !llvm.loop !0
|
|
|
|
exit: ; preds = %vector.body
|
|
ret void
|
|
}
|
|
; !0 is the loop ID attached to the vector.body backedge of
; @saxpy_tripcount1K_av1. !1 is the llvm.loop.isvectorized attribute; it is
; also referenced by the loop IDs !2 and !3 further down in this file.
!0 = !{!0, !1}
|
|
!1 = !{!"llvm.loop.isvectorized", i32 1}
|
|
|
|
; On Cortex-A55 we should runtime unroll the scalar epilogue loop, but not the
|
|
; vector loop.
; Both vector.body (!2) and scalar.remainder (!3) are tagged with
; llvm.loop.isvectorized, so that attribute alone does not distinguish them;
; only vector.body operates on <16 x i8> vectors.
; Expected output, per the assertions below:
;  - apple-m1: both loops are left as they are (steps of 32 and 1).
;  - cortex-a55: vector.body is left as it is; scalar.remainder is unrolled by
;    4, preceded by a straight-line prologue of up to 3 peeled iterations
;    selected by (remaining iterations & 3).
|
|
define void @scalar_epilogue(ptr %p, i8 %splat.scalar, i64 %n) {
|
|
; APPLE-LABEL: define void @scalar_epilogue(
|
|
; APPLE-SAME: ptr [[P:%.*]], i8 [[SPLAT_SCALAR:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
|
|
; APPLE-NEXT: [[ENTRY:.*]]:
|
|
; APPLE-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[N]], 32
|
|
; APPLE-NEXT: br i1 [[MIN_ITERS_CHECK7]], label %[[SCALAR_REMAINDER_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
|
|
; APPLE: [[VECTOR_PH]]:
|
|
; APPLE-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -32
|
|
; APPLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[SPLAT_SCALAR]], i64 0
|
|
; APPLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
|
|
; APPLE-NEXT: br label %[[VECTOR_BODY:.*]]
|
|
; APPLE: [[VECTOR_BODY]]:
|
|
; APPLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
|
|
; APPLE-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INDEX]]
|
|
; APPLE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
|
|
; APPLE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
|
|
; APPLE-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
|
|
; APPLE-NEXT: [[TMP2:%.*]] = add <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
|
|
; APPLE-NEXT: [[TMP3:%.*]] = add <16 x i8> [[WIDE_LOAD8]], [[BROADCAST_SPLAT]]
|
|
; APPLE-NEXT: store <16 x i8> [[TMP2]], ptr [[TMP0]], align 1
|
|
; APPLE-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP1]], align 1
|
|
; APPLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
|
|
; APPLE-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
|
|
; APPLE-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
|
|
; APPLE: [[MIDDLE_BLOCK]]:
|
|
; APPLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
|
|
; APPLE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_REMAINDER_PREHEADER]]
|
|
; APPLE: [[SCALAR_REMAINDER_PREHEADER]]:
|
|
; APPLE-NEXT: [[IV_SCALAR_LOOP_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ]
|
|
; APPLE-NEXT: br label %[[SCALAR_REMAINDER:.*]]
|
|
; APPLE: [[SCALAR_REMAINDER]]:
|
|
; APPLE-NEXT: [[I_06:%.*]] = phi i64 [ [[INC:%.*]], %[[SCALAR_REMAINDER]] ], [ [[IV_SCALAR_LOOP_PH]], %[[SCALAR_REMAINDER_PREHEADER]] ]
|
|
; APPLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[I_06]]
|
|
; APPLE-NEXT: [[TMP8:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
|
|
; APPLE-NEXT: [[ADD:%.*]] = add i8 [[TMP8]], [[SPLAT_SCALAR]]
|
|
; APPLE-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
|
|
; APPLE-NEXT: [[INC]] = add nuw i64 [[I_06]], 1
|
|
; APPLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
|
|
; APPLE-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_REMAINDER]], !llvm.loop [[LOOP5:![0-9]+]]
|
|
; APPLE: [[EXIT_LOOPEXIT]]:
|
|
; APPLE-NEXT: br label %[[EXIT]]
|
|
; APPLE: [[EXIT]]:
|
|
; APPLE-NEXT: ret void
|
|
;
|
|
; CORTEXA55-LABEL: define void @scalar_epilogue(
|
|
; CORTEXA55-SAME: ptr [[P:%.*]], i8 [[SPLAT_SCALAR:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
|
|
; CORTEXA55-NEXT: [[ENTRY:.*]]:
|
|
; CORTEXA55-NEXT: [[MIN_ITERS_CHECK7:%.*]] = icmp ult i64 [[N]], 32
|
|
; CORTEXA55-NEXT: br i1 [[MIN_ITERS_CHECK7]], label %[[SCALAR_REMAINDER_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
|
|
; CORTEXA55: [[VECTOR_PH]]:
|
|
; CORTEXA55-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -32
|
|
; CORTEXA55-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[SPLAT_SCALAR]], i64 0
|
|
; CORTEXA55-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
|
|
; CORTEXA55-NEXT: br label %[[VECTOR_BODY:.*]]
|
|
; CORTEXA55: [[VECTOR_BODY]]:
|
|
; CORTEXA55-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
|
|
; CORTEXA55-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INDEX]]
|
|
; CORTEXA55-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16
|
|
; CORTEXA55-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
|
|
; CORTEXA55-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
|
|
; CORTEXA55-NEXT: [[TMP2:%.*]] = add <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
|
|
; CORTEXA55-NEXT: [[TMP3:%.*]] = add <16 x i8> [[WIDE_LOAD8]], [[BROADCAST_SPLAT]]
|
|
; CORTEXA55-NEXT: store <16 x i8> [[TMP2]], ptr [[TMP0]], align 1
|
|
; CORTEXA55-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP1]], align 1
|
|
; CORTEXA55-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
|
|
; CORTEXA55-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
|
|
; CORTEXA55-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
|
|
; CORTEXA55: [[MIDDLE_BLOCK]]:
|
|
; CORTEXA55-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
|
|
; CORTEXA55-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_REMAINDER_PREHEADER]]
|
|
; CORTEXA55: [[SCALAR_REMAINDER_PREHEADER]]:
|
|
; CORTEXA55-NEXT: [[I_06_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ]
|
|
; CORTEXA55-NEXT: [[TMP8:%.*]] = sub i64 [[N]], [[I_06_PH]]
|
|
; CORTEXA55-NEXT: [[TMP9:%.*]] = add i64 [[N]], -1
|
|
; CORTEXA55-NEXT: [[TMP10:%.*]] = sub i64 [[TMP9]], [[I_06_PH]]
|
|
; CORTEXA55-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP8]], 3
|
|
; CORTEXA55-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
|
|
; CORTEXA55-NEXT: br i1 [[LCMP_MOD]], label %[[SCALAR_REMAINDER_PROL_PREHEADER:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT:.*]]
|
|
; CORTEXA55: [[SCALAR_REMAINDER_PROL_PREHEADER]]:
|
|
; CORTEXA55-NEXT: br label %[[SCALAR_REMAINDER_PROL:.*]]
|
|
; CORTEXA55: [[SCALAR_REMAINDER_PROL]]:
|
|
; CORTEXA55-NEXT: [[ARRAYIDX_PROL:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[I_06_PH]]
|
|
; CORTEXA55-NEXT: [[TMP11:%.*]] = load i8, ptr [[ARRAYIDX_PROL]], align 1
|
|
; CORTEXA55-NEXT: [[ADD_PROL:%.*]] = add i8 [[TMP11]], [[SPLAT_SCALAR]]
|
|
; CORTEXA55-NEXT: store i8 [[ADD_PROL]], ptr [[ARRAYIDX_PROL]], align 1
|
|
; CORTEXA55-NEXT: [[INC_PROL:%.*]] = add nuw i64 [[I_06_PH]], 1
|
|
; CORTEXA55-NEXT: [[PROL_ITER_CMP:%.*]] = icmp ne i64 1, [[XTRAITER]]
|
|
; CORTEXA55-NEXT: br i1 [[PROL_ITER_CMP]], label %[[SCALAR_REMAINDER_PROL_1:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA:.*]]
|
|
; CORTEXA55: [[SCALAR_REMAINDER_PROL_1]]:
|
|
; CORTEXA55-NEXT: [[ARRAYIDX_PROL_1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_PROL]]
|
|
; CORTEXA55-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX_PROL_1]], align 1
|
|
; CORTEXA55-NEXT: [[ADD_PROL_1:%.*]] = add i8 [[TMP12]], [[SPLAT_SCALAR]]
|
|
; CORTEXA55-NEXT: store i8 [[ADD_PROL_1]], ptr [[ARRAYIDX_PROL_1]], align 1
|
|
; CORTEXA55-NEXT: [[INC_PROL_1:%.*]] = add nuw i64 [[I_06_PH]], 2
|
|
; CORTEXA55-NEXT: [[PROL_ITER_CMP_1:%.*]] = icmp ne i64 2, [[XTRAITER]]
|
|
; CORTEXA55-NEXT: br i1 [[PROL_ITER_CMP_1]], label %[[SCALAR_REMAINDER_PROL_2:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]
|
|
; CORTEXA55: [[SCALAR_REMAINDER_PROL_2]]:
|
|
; CORTEXA55-NEXT: [[ARRAYIDX_PROL_2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_PROL_1]]
|
|
; CORTEXA55-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX_PROL_2]], align 1
|
|
; CORTEXA55-NEXT: [[ADD_PROL_2:%.*]] = add i8 [[TMP13]], [[SPLAT_SCALAR]]
|
|
; CORTEXA55-NEXT: store i8 [[ADD_PROL_2]], ptr [[ARRAYIDX_PROL_2]], align 1
|
|
; CORTEXA55-NEXT: [[INC_PROL_2:%.*]] = add nuw i64 [[I_06_PH]], 3
|
|
; CORTEXA55-NEXT: br label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]
|
|
; CORTEXA55: [[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]:
|
|
; CORTEXA55-NEXT: [[IV_SCALAR_LOOP_UNR_PH:%.*]] = phi i64 [ [[INC_PROL]], %[[SCALAR_REMAINDER_PROL]] ], [ [[INC_PROL_1]], %[[SCALAR_REMAINDER_PROL_1]] ], [ [[INC_PROL_2]], %[[SCALAR_REMAINDER_PROL_2]] ]
|
|
; CORTEXA55-NEXT: br label %[[SCALAR_REMAINDER_PROL_LOOPEXIT]]
|
|
; CORTEXA55: [[SCALAR_REMAINDER_PROL_LOOPEXIT]]:
|
|
; CORTEXA55-NEXT: [[IV_SCALAR_LOOP_UNR:%.*]] = phi i64 [ [[I_06_PH]], %[[SCALAR_REMAINDER_PREHEADER]] ], [ [[IV_SCALAR_LOOP_UNR_PH]], %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]] ]
|
|
; CORTEXA55-NEXT: [[TMP14:%.*]] = icmp ult i64 [[TMP10]], 3
|
|
; CORTEXA55-NEXT: br i1 [[TMP14]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_REMAINDER_PREHEADER_NEW:.*]]
|
|
; CORTEXA55: [[SCALAR_REMAINDER_PREHEADER_NEW]]:
|
|
; CORTEXA55-NEXT: br label %[[SCALAR_REMAINDER:.*]]
|
|
; CORTEXA55: [[SCALAR_REMAINDER]]:
|
|
; CORTEXA55-NEXT: [[I_06:%.*]] = phi i64 [ [[IV_SCALAR_LOOP_UNR]], %[[SCALAR_REMAINDER_PREHEADER_NEW]] ], [ [[INC_3:%.*]], %[[SCALAR_REMAINDER]] ]
|
|
; CORTEXA55-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[I_06]]
|
|
; CORTEXA55-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
|
|
; CORTEXA55-NEXT: [[ADD:%.*]] = add i8 [[TMP15]], [[SPLAT_SCALAR]]
|
|
; CORTEXA55-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
|
|
; CORTEXA55-NEXT: [[INC:%.*]] = add nuw i64 [[I_06]], 1
|
|
; CORTEXA55-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC]]
|
|
; CORTEXA55-NEXT: [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1
|
|
; CORTEXA55-NEXT: [[ADD_1:%.*]] = add i8 [[TMP16]], [[SPLAT_SCALAR]]
|
|
; CORTEXA55-NEXT: store i8 [[ADD_1]], ptr [[ARRAYIDX_1]], align 1
|
|
; CORTEXA55-NEXT: [[INC_1:%.*]] = add nuw i64 [[I_06]], 2
|
|
; CORTEXA55-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_1]]
|
|
; CORTEXA55-NEXT: [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1
|
|
; CORTEXA55-NEXT: [[ADD_2:%.*]] = add i8 [[TMP17]], [[SPLAT_SCALAR]]
|
|
; CORTEXA55-NEXT: store i8 [[ADD_2]], ptr [[ARRAYIDX_2]], align 1
|
|
; CORTEXA55-NEXT: [[INC_2:%.*]] = add nuw i64 [[I_06]], 3
|
|
; CORTEXA55-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_2]]
|
|
; CORTEXA55-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1
|
|
; CORTEXA55-NEXT: [[ADD_3:%.*]] = add i8 [[TMP18]], [[SPLAT_SCALAR]]
|
|
; CORTEXA55-NEXT: store i8 [[ADD_3]], ptr [[ARRAYIDX_3]], align 1
|
|
; CORTEXA55-NEXT: [[INC_3]] = add nuw i64 [[I_06]], 4
|
|
; CORTEXA55-NEXT: [[EXITCOND_NOT_3:%.*]] = icmp eq i64 [[INC_3]], [[N]]
|
|
; CORTEXA55-NEXT: br i1 [[EXITCOND_NOT_3]], label %[[EXIT_LOOPEXIT_UNR_LCSSA:.*]], label %[[SCALAR_REMAINDER]], !llvm.loop [[LOOP3:![0-9]+]]
|
|
; CORTEXA55: [[EXIT_LOOPEXIT_UNR_LCSSA]]:
|
|
; CORTEXA55-NEXT: br label %[[EXIT_LOOPEXIT]]
|
|
; CORTEXA55: [[EXIT_LOOPEXIT]]:
|
|
; CORTEXA55-NEXT: br label %[[EXIT]]
|
|
; CORTEXA55: [[EXIT]]:
|
|
; CORTEXA55-NEXT: ret void
|
|
;
|
|
entry:
|
|
%min.iters.check = icmp ult i64 %n, 32
|
|
br i1 %min.iters.check, label %scalar.remainder, label %vector.ph
|
|
|
|
vector.ph:
|
|
%n.vec = and i64 %n, -32
|
|
%broadcast.splatinsert = insertelement <16 x i8> poison, i8 %splat.scalar, i64 0
|
|
%broadcast.splat = shufflevector <16 x i8> %broadcast.splatinsert, <16 x i8> poison, <16 x i32> zeroinitializer
|
|
br label %vector.body
|
|
|
|
; Vector loop: two <16 x i8> load/add/store pairs per iteration, advancing the
; induction variable by 32 until it reaches %n.vec (%n rounded down to a
; multiple of 32). Tagged with loop metadata !2.
vector.body:
|
|
%iv = phi i64 [ 0, %vector.ph ], [ %iv.next, %vector.body ]
|
|
%gep.p.iv = getelementptr inbounds nuw i8, ptr %p, i64 %iv
|
|
%gep.p.iv.16 = getelementptr inbounds nuw i8, ptr %gep.p.iv, i64 16
|
|
%wide.load = load <16 x i8>, ptr %gep.p.iv, align 1
|
|
%wide.load.2 = load <16 x i8>, ptr %gep.p.iv.16, align 1
|
|
%add.broadcast = add <16 x i8> %wide.load, %broadcast.splat
|
|
%add.broadcast.2 = add <16 x i8> %wide.load.2, %broadcast.splat
|
|
store <16 x i8> %add.broadcast, ptr %gep.p.iv, align 1
|
|
store <16 x i8> %add.broadcast.2, ptr %gep.p.iv.16, align 1
|
|
%iv.next = add nuw i64 %iv, 32
|
|
%exit.cond = icmp eq i64 %iv.next, %n.vec
|
|
br i1 %exit.cond, label %middle.block, label %vector.body, !llvm.loop !2
|
|
|
|
middle.block:
|
|
%cmp.n = icmp eq i64 %n, %n.vec
|
|
br i1 %cmp.n, label %exit, label %scalar.remainder
|
|
|
|
; Scalar remainder loop: one i8 element per iteration. It starts at 0 when
; entered from entry (%n < 32) and at %n.vec when entered from middle.block
; (%n != %n.vec). Tagged with loop metadata !3.
scalar.remainder:
|
|
%iv.scalar.loop = phi i64 [ %inc, %scalar.remainder ], [ %n.vec, %middle.block ], [ 0, %entry ]
|
|
%arrayidx = getelementptr inbounds nuw i8, ptr %p, i64 %iv.scalar.loop
|
|
%scalar.load = load i8, ptr %arrayidx, align 1
|
|
%add = add i8 %scalar.load, %splat.scalar
|
|
store i8 %add, ptr %arrayidx, align 1
|
|
%inc = add nuw i64 %iv.scalar.loop, 1
|
|
%exitcond.not = icmp eq i64 %inc, %n
|
|
br i1 %exitcond.not, label %exit, label %scalar.remainder, !llvm.loop !3
|
|
|
|
exit:
|
|
ret void
|
|
}
|
|
|
|
; Loop IDs for @scalar_epilogue: !2 is attached to the vector.body backedge and
; !3 to the scalar.remainder backedge. Both reference !1, the
; llvm.loop.isvectorized attribute.
!2 = distinct !{!2, !1}
|
|
!3 = distinct !{!3, !1}
|
|
|
|
;.
|
|
; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
|
|
; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
|
|
; APPLE: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]}
|
|
; APPLE: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
|
|
; APPLE: [[LOOP4]] = distinct !{[[LOOP4]], [[META3]]}
|
|
; APPLE: [[LOOP5]] = distinct !{[[LOOP5]], [[META3]]}
|
|
;.
|
|
; CORTEXA55: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
|
|
; CORTEXA55: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
|
|
; CORTEXA55: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
|
|
; CORTEXA55: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
|
|
;.
|