Pengcheng Wang a0b6638c85
[RISCV] Don't unroll vectorized loops with vector operands (#171089)
We have disabled unrolling for vectorized loops in #151525 but this
PR only checked the instruction type.

For some loops, there is no instruction with vector type but they
are still vector operations (just like the memset zero test in the
precommit test).

Here we check the operands as well to cover these cases.
2025-12-09 12:42:41 +08:00

726 lines
50 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -p loop-unroll -mtriple riscv64 -mattr=+v,+f -S %s | FileCheck %s --check-prefixes=COMMON,CHECK
; RUN: opt -p loop-unroll -mtriple=riscv64 -mcpu=sifive-p870 -S %s | FileCheck %s --check-prefixes=COMMON,SIFIVE
define void @reverse(ptr %dst, ptr %src, i64 %len) {
; CHECK-LABEL: define void @reverse(
; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[FOR_BODY]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = sub nsw i64 [[LEN]], [[IV]]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
; CHECK-NEXT: store <4 x float> [[TMP1]], ptr [[ARRAYIDX2]], align 16
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[LEN]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[FOR_BODY]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
; SIFIVE-LABEL: define void @reverse(
; SIFIVE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i64 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
; SIFIVE-NEXT: [[ENTRY:.*]]:
; SIFIVE-NEXT: [[TMP2:%.*]] = add i64 [[LEN]], -1
; SIFIVE-NEXT: [[XTRAITER:%.*]] = and i64 [[LEN]], 7
; SIFIVE-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 7
; SIFIVE-NEXT: br i1 [[TMP3]], label %[[FOR_BODY_EPIL_PREHEADER:.*]], label %[[ENTRY_NEW:.*]]
; SIFIVE: [[ENTRY_NEW]]:
; SIFIVE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[LEN]], [[XTRAITER]]
; SIFIVE-NEXT: br label %[[FOR_BODY:.*]]
; SIFIVE: [[FOR_BODY]]:
; SIFIVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_7:%.*]], %[[FOR_BODY]] ]
; SIFIVE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[FOR_BODY]] ]
; SIFIVE-NEXT: [[TMP0:%.*]] = sub nsw i64 [[LEN]], [[IV]]
; SIFIVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP0]]
; SIFIVE-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV]]
; SIFIVE-NEXT: store <4 x float> [[TMP1]], ptr [[ARRAYIDX2]], align 16
; SIFIVE-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
; SIFIVE-NEXT: [[TMP4:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT]]
; SIFIVE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP4]]
; SIFIVE-NEXT: [[TMP5:%.*]] = load <4 x float>, ptr [[ARRAYIDX_1]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT]]
; SIFIVE-NEXT: store <4 x float> [[TMP5]], ptr [[ARRAYIDX2_1]], align 16
; SIFIVE-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
; SIFIVE-NEXT: [[TMP6:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_1]]
; SIFIVE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP6]]
; SIFIVE-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX_2]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_1]]
; SIFIVE-NEXT: store <4 x float> [[TMP7]], ptr [[ARRAYIDX2_2]], align 16
; SIFIVE-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
; SIFIVE-NEXT: [[TMP8:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_2]]
; SIFIVE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP8]]
; SIFIVE-NEXT: [[TMP9:%.*]] = load <4 x float>, ptr [[ARRAYIDX_3]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_2]]
; SIFIVE-NEXT: store <4 x float> [[TMP9]], ptr [[ARRAYIDX2_3]], align 16
; SIFIVE-NEXT: [[IV_NEXT_3:%.*]] = add nuw nsw i64 [[IV]], 4
; SIFIVE-NEXT: [[TMP10:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_3]]
; SIFIVE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP10]]
; SIFIVE-NEXT: [[TMP11:%.*]] = load <4 x float>, ptr [[ARRAYIDX_4]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_3]]
; SIFIVE-NEXT: store <4 x float> [[TMP11]], ptr [[ARRAYIDX2_4]], align 16
; SIFIVE-NEXT: [[IV_NEXT_4:%.*]] = add nuw nsw i64 [[IV]], 5
; SIFIVE-NEXT: [[TMP12:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_4]]
; SIFIVE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP12]]
; SIFIVE-NEXT: [[TMP13:%.*]] = load <4 x float>, ptr [[ARRAYIDX_5]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_4]]
; SIFIVE-NEXT: store <4 x float> [[TMP13]], ptr [[ARRAYIDX2_5]], align 16
; SIFIVE-NEXT: [[IV_NEXT_5:%.*]] = add nuw nsw i64 [[IV]], 6
; SIFIVE-NEXT: [[TMP14:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_5]]
; SIFIVE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP14]]
; SIFIVE-NEXT: [[TMP15:%.*]] = load <4 x float>, ptr [[ARRAYIDX_6]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_5]]
; SIFIVE-NEXT: store <4 x float> [[TMP15]], ptr [[ARRAYIDX2_6]], align 16
; SIFIVE-NEXT: [[IV_NEXT_6:%.*]] = add nuw nsw i64 [[IV]], 7
; SIFIVE-NEXT: [[TMP16:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_6]]
; SIFIVE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP16]]
; SIFIVE-NEXT: [[TMP17:%.*]] = load <4 x float>, ptr [[ARRAYIDX_7]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_7:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_6]]
; SIFIVE-NEXT: store <4 x float> [[TMP17]], ptr [[ARRAYIDX2_7]], align 16
; SIFIVE-NEXT: [[IV_NEXT_7]] = add nuw nsw i64 [[IV]], 8
; SIFIVE-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8
; SIFIVE-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]]
; SIFIVE-NEXT: br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA:.*]], label %[[FOR_BODY]]
; SIFIVE: [[EXIT_UNR_LCSSA]]:
; SIFIVE-NEXT: [[IV_UNR1:%.*]] = phi i64 [ [[IV_NEXT_7]], %[[FOR_BODY]] ]
; SIFIVE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
; SIFIVE-NEXT: br i1 [[LCMP_MOD]], label %[[FOR_BODY_EPIL_PREHEADER]], label %[[EXIT:.*]]
; SIFIVE: [[FOR_BODY_EPIL_PREHEADER]]:
; SIFIVE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR1]], %[[EXIT_UNR_LCSSA]] ]
; SIFIVE-NEXT: [[LCMP_MOD1:%.*]] = icmp ne i64 [[XTRAITER]], 0
; SIFIVE-NEXT: call void @llvm.assume(i1 [[LCMP_MOD1]])
; SIFIVE-NEXT: br label %[[FOR_BODY_EPIL:.*]]
; SIFIVE: [[FOR_BODY_EPIL]]:
; SIFIVE-NEXT: [[TMP18:%.*]] = sub nsw i64 [[LEN]], [[IV_UNR]]
; SIFIVE-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP18]]
; SIFIVE-NEXT: [[TMP19:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_EPIL:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_UNR]]
; SIFIVE-NEXT: store <4 x float> [[TMP19]], ptr [[ARRAYIDX2_EPIL]], align 16
; SIFIVE-NEXT: [[IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[IV_UNR]], 1
; SIFIVE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 1, [[XTRAITER]]
; SIFIVE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[FOR_BODY_EPIL_1:.*]], label %[[EXIT_EPILOG_LCSSA:.*]]
; SIFIVE: [[FOR_BODY_EPIL_1]]:
; SIFIVE-NEXT: [[TMP20:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL]]
; SIFIVE-NEXT: [[ARRAYIDX_EPIL_1:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP20]]
; SIFIVE-NEXT: [[TMP21:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_1]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_EPIL_1:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL]]
; SIFIVE-NEXT: store <4 x float> [[TMP21]], ptr [[ARRAYIDX2_EPIL_1]], align 16
; SIFIVE-NEXT: [[IV_NEXT_EPIL_1:%.*]] = add nuw nsw i64 [[IV_UNR]], 2
; SIFIVE-NEXT: [[EPIL_ITER_CMP_1:%.*]] = icmp ne i64 2, [[XTRAITER]]
; SIFIVE-NEXT: br i1 [[EPIL_ITER_CMP_1]], label %[[FOR_BODY_EPIL_2:.*]], label %[[EXIT_EPILOG_LCSSA]]
; SIFIVE: [[FOR_BODY_EPIL_2]]:
; SIFIVE-NEXT: [[TMP22:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL_1]]
; SIFIVE-NEXT: [[ARRAYIDX_EPIL_2:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP22]]
; SIFIVE-NEXT: [[TMP23:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_2]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_EPIL_2:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL_1]]
; SIFIVE-NEXT: store <4 x float> [[TMP23]], ptr [[ARRAYIDX2_EPIL_2]], align 16
; SIFIVE-NEXT: [[IV_NEXT_EPIL_2:%.*]] = add nuw nsw i64 [[IV_UNR]], 3
; SIFIVE-NEXT: [[EPIL_ITER_CMP_2:%.*]] = icmp ne i64 3, [[XTRAITER]]
; SIFIVE-NEXT: br i1 [[EPIL_ITER_CMP_2]], label %[[FOR_BODY_EPIL_3:.*]], label %[[EXIT_EPILOG_LCSSA]]
; SIFIVE: [[FOR_BODY_EPIL_3]]:
; SIFIVE-NEXT: [[TMP24:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL_2]]
; SIFIVE-NEXT: [[ARRAYIDX_EPIL_3:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP24]]
; SIFIVE-NEXT: [[TMP25:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_3]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_EPIL_3:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL_2]]
; SIFIVE-NEXT: store <4 x float> [[TMP25]], ptr [[ARRAYIDX2_EPIL_3]], align 16
; SIFIVE-NEXT: [[IV_NEXT_EPIL_3:%.*]] = add nuw nsw i64 [[IV_UNR]], 4
; SIFIVE-NEXT: [[EPIL_ITER_CMP_3:%.*]] = icmp ne i64 4, [[XTRAITER]]
; SIFIVE-NEXT: br i1 [[EPIL_ITER_CMP_3]], label %[[FOR_BODY_EPIL_4:.*]], label %[[EXIT_EPILOG_LCSSA]]
; SIFIVE: [[FOR_BODY_EPIL_4]]:
; SIFIVE-NEXT: [[TMP26:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL_3]]
; SIFIVE-NEXT: [[ARRAYIDX_EPIL_4:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP26]]
; SIFIVE-NEXT: [[TMP27:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_4]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_EPIL_4:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL_3]]
; SIFIVE-NEXT: store <4 x float> [[TMP27]], ptr [[ARRAYIDX2_EPIL_4]], align 16
; SIFIVE-NEXT: [[IV_NEXT_EPIL_4:%.*]] = add nuw nsw i64 [[IV_UNR]], 5
; SIFIVE-NEXT: [[EPIL_ITER_CMP_4:%.*]] = icmp ne i64 5, [[XTRAITER]]
; SIFIVE-NEXT: br i1 [[EPIL_ITER_CMP_4]], label %[[FOR_BODY_EPIL_5:.*]], label %[[EXIT_EPILOG_LCSSA]]
; SIFIVE: [[FOR_BODY_EPIL_5]]:
; SIFIVE-NEXT: [[TMP28:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL_4]]
; SIFIVE-NEXT: [[ARRAYIDX_EPIL_5:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP28]]
; SIFIVE-NEXT: [[TMP29:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_5]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_EPIL_5:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL_4]]
; SIFIVE-NEXT: store <4 x float> [[TMP29]], ptr [[ARRAYIDX2_EPIL_5]], align 16
; SIFIVE-NEXT: [[IV_NEXT_EPIL_5:%.*]] = add nuw nsw i64 [[IV_UNR]], 6
; SIFIVE-NEXT: [[EPIL_ITER_CMP_5:%.*]] = icmp ne i64 6, [[XTRAITER]]
; SIFIVE-NEXT: br i1 [[EPIL_ITER_CMP_5]], label %[[FOR_BODY_EPIL_6:.*]], label %[[EXIT_EPILOG_LCSSA]]
; SIFIVE: [[FOR_BODY_EPIL_6]]:
; SIFIVE-NEXT: [[TMP30:%.*]] = sub nsw i64 [[LEN]], [[IV_NEXT_EPIL_5]]
; SIFIVE-NEXT: [[ARRAYIDX_EPIL_6:%.*]] = getelementptr inbounds <4 x float>, ptr [[SRC]], i64 [[TMP30]]
; SIFIVE-NEXT: [[TMP31:%.*]] = load <4 x float>, ptr [[ARRAYIDX_EPIL_6]], align 16
; SIFIVE-NEXT: [[ARRAYIDX2_EPIL_6:%.*]] = getelementptr inbounds nuw <4 x float>, ptr [[DST]], i64 [[IV_NEXT_EPIL_5]]
; SIFIVE-NEXT: store <4 x float> [[TMP31]], ptr [[ARRAYIDX2_EPIL_6]], align 16
; SIFIVE-NEXT: br label %[[EXIT_EPILOG_LCSSA]]
; SIFIVE: [[EXIT_EPILOG_LCSSA]]:
; SIFIVE-NEXT: br label %[[EXIT]]
; SIFIVE: [[EXIT]]:
; SIFIVE-NEXT: ret void
;
entry: ; preds = %entry
br label %for.body
for.body: ; preds = %entry, %for.body
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%1 = sub nsw i64 %len, %iv
%arrayidx = getelementptr inbounds <4 x float>, ptr %src, i64 %1
%2 = load <4 x float>, ptr %arrayidx, align 16
%arrayidx2 = getelementptr inbounds nuw <4 x float>, ptr %dst, i64 %iv
store <4 x float> %2, ptr %arrayidx2, align 16
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, %len
br i1 %exitcond.not, label %exit, label %for.body
exit: ; preds = %for.body, %entry
ret void
}
define void @saxpy_tripcount8_full_unroll(ptr %dst, ptr %src, float %a) {
; COMMON-LABEL: define void @saxpy_tripcount8_full_unroll(
; COMMON-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0:[0-9]+]] {
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; COMMON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; COMMON-NEXT: br label %[[VECTOR_BODY:.*]]
; COMMON: [[VECTOR_BODY]]:
; COMMON-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[SRC]], align 4
; COMMON-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[DST]], align 4
; COMMON-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; COMMON-NEXT: store <4 x float> [[TMP0]], ptr [[DST]], align 4
; COMMON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 4
; COMMON-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; COMMON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 4
; COMMON-NEXT: [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
; COMMON-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
; COMMON-NEXT: store <4 x float> [[TMP3]], ptr [[TMP2]], align 4
; COMMON-NEXT: ret void
;
entry:
%broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
%broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds nuw float, ptr %src, i64 %index
%wide.load = load <4 x float>, ptr %0, align 4
%1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
%wide.load12 = load <4 x float>, ptr %1, align 4
%2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
store <4 x float> %2, ptr %1, align 4
%index.next = add nuw i64 %index, 4
%3 = icmp eq i64 %index.next, 8
br i1 %3, label %exit, label %vector.body
exit: ; preds = %vector.body
ret void
}
define void @saxpy_tripcount1K_av0(ptr %dst, ptr %src, float %a) {
; CHECK-LABEL: define void @saxpy_tripcount1K_av0(
; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; CHECK-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
; SIFIVE-LABEL: define void @saxpy_tripcount1K_av0(
; SIFIVE-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
; SIFIVE-NEXT: [[ENTRY:.*]]:
; SIFIVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; SIFIVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; SIFIVE-NEXT: br label %[[VECTOR_BODY:.*]]
; SIFIVE: [[VECTOR_BODY]]:
; SIFIVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT_15:%.*]], %[[VECTOR_BODY]] ]
; SIFIVE-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
; SIFIVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
; SIFIVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
; SIFIVE-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; SIFIVE-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; SIFIVE-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT1:%.*]] = add nuw nsw i64 [[INDEX]], 4
; SIFIVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT1]]
; SIFIVE-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x float>, ptr [[TMP12]], align 4
; SIFIVE-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT1]]
; SIFIVE-NEXT: [[WIDE_LOAD12_1:%.*]] = load <4 x float>, ptr [[TMP4]], align 4
; SIFIVE-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_1]], <4 x float> [[WIDE_LOAD12_1]])
; SIFIVE-NEXT: store <4 x float> [[TMP5]], ptr [[TMP4]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_1:%.*]] = add nuw nsw i64 [[INDEX]], 8
; SIFIVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_1]]
; SIFIVE-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x float>, ptr [[TMP6]], align 4
; SIFIVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_1]]
; SIFIVE-NEXT: [[WIDE_LOAD12_2:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
; SIFIVE-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_2]], <4 x float> [[WIDE_LOAD12_2]])
; SIFIVE-NEXT: store <4 x float> [[TMP8]], ptr [[TMP7]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_2:%.*]] = add nuw nsw i64 [[INDEX]], 12
; SIFIVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_2]]
; SIFIVE-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x float>, ptr [[TMP9]], align 4
; SIFIVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_2]]
; SIFIVE-NEXT: [[WIDE_LOAD12_3:%.*]] = load <4 x float>, ptr [[TMP10]], align 4
; SIFIVE-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_3]], <4 x float> [[WIDE_LOAD12_3]])
; SIFIVE-NEXT: store <4 x float> [[TMP11]], ptr [[TMP10]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT:%.*]] = add nuw nsw i64 [[INDEX]], 16
; SIFIVE-NEXT: [[TMP49:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT]]
; SIFIVE-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x float>, ptr [[TMP49]], align 4
; SIFIVE-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT]]
; SIFIVE-NEXT: [[WIDE_LOAD12_4:%.*]] = load <4 x float>, ptr [[TMP13]], align 4
; SIFIVE-NEXT: [[TMP14:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_4]], <4 x float> [[WIDE_LOAD12_4]])
; SIFIVE-NEXT: store <4 x float> [[TMP14]], ptr [[TMP13]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_4:%.*]] = add nuw nsw i64 [[INDEX]], 20
; SIFIVE-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_4]]
; SIFIVE-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x float>, ptr [[TMP15]], align 4
; SIFIVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_4]]
; SIFIVE-NEXT: [[WIDE_LOAD12_5:%.*]] = load <4 x float>, ptr [[TMP16]], align 4
; SIFIVE-NEXT: [[TMP17:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_5]], <4 x float> [[WIDE_LOAD12_5]])
; SIFIVE-NEXT: store <4 x float> [[TMP17]], ptr [[TMP16]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_5:%.*]] = add nuw nsw i64 [[INDEX]], 24
; SIFIVE-NEXT: [[TMP18:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_5]]
; SIFIVE-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x float>, ptr [[TMP18]], align 4
; SIFIVE-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_5]]
; SIFIVE-NEXT: [[WIDE_LOAD12_6:%.*]] = load <4 x float>, ptr [[TMP19]], align 4
; SIFIVE-NEXT: [[TMP20:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_6]], <4 x float> [[WIDE_LOAD12_6]])
; SIFIVE-NEXT: store <4 x float> [[TMP20]], ptr [[TMP19]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_6:%.*]] = add nuw nsw i64 [[INDEX]], 28
; SIFIVE-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_6]]
; SIFIVE-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x float>, ptr [[TMP21]], align 4
; SIFIVE-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_6]]
; SIFIVE-NEXT: [[WIDE_LOAD12_7:%.*]] = load <4 x float>, ptr [[TMP22]], align 4
; SIFIVE-NEXT: [[TMP23:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_7]], <4 x float> [[WIDE_LOAD12_7]])
; SIFIVE-NEXT: store <4 x float> [[TMP23]], ptr [[TMP22]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_7:%.*]] = add nuw nsw i64 [[INDEX]], 32
; SIFIVE-NEXT: [[TMP24:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_7]]
; SIFIVE-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x float>, ptr [[TMP24]], align 4
; SIFIVE-NEXT: [[TMP25:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_7]]
; SIFIVE-NEXT: [[WIDE_LOAD12_8:%.*]] = load <4 x float>, ptr [[TMP25]], align 4
; SIFIVE-NEXT: [[TMP26:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_8]], <4 x float> [[WIDE_LOAD12_8]])
; SIFIVE-NEXT: store <4 x float> [[TMP26]], ptr [[TMP25]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_8:%.*]] = add nuw nsw i64 [[INDEX]], 36
; SIFIVE-NEXT: [[TMP27:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_8]]
; SIFIVE-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x float>, ptr [[TMP27]], align 4
; SIFIVE-NEXT: [[TMP28:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_8]]
; SIFIVE-NEXT: [[WIDE_LOAD12_9:%.*]] = load <4 x float>, ptr [[TMP28]], align 4
; SIFIVE-NEXT: [[TMP29:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_9]], <4 x float> [[WIDE_LOAD12_9]])
; SIFIVE-NEXT: store <4 x float> [[TMP29]], ptr [[TMP28]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_9:%.*]] = add nuw nsw i64 [[INDEX]], 40
; SIFIVE-NEXT: [[TMP30:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_9]]
; SIFIVE-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x float>, ptr [[TMP30]], align 4
; SIFIVE-NEXT: [[TMP31:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_9]]
; SIFIVE-NEXT: [[WIDE_LOAD12_10:%.*]] = load <4 x float>, ptr [[TMP31]], align 4
; SIFIVE-NEXT: [[TMP32:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_10]], <4 x float> [[WIDE_LOAD12_10]])
; SIFIVE-NEXT: store <4 x float> [[TMP32]], ptr [[TMP31]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_10:%.*]] = add nuw nsw i64 [[INDEX]], 44
; SIFIVE-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_10]]
; SIFIVE-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x float>, ptr [[TMP33]], align 4
; SIFIVE-NEXT: [[TMP34:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_10]]
; SIFIVE-NEXT: [[WIDE_LOAD12_11:%.*]] = load <4 x float>, ptr [[TMP34]], align 4
; SIFIVE-NEXT: [[TMP35:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_11]], <4 x float> [[WIDE_LOAD12_11]])
; SIFIVE-NEXT: store <4 x float> [[TMP35]], ptr [[TMP34]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_11:%.*]] = add nuw nsw i64 [[INDEX]], 48
; SIFIVE-NEXT: [[TMP36:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_11]]
; SIFIVE-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x float>, ptr [[TMP36]], align 4
; SIFIVE-NEXT: [[TMP37:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_11]]
; SIFIVE-NEXT: [[WIDE_LOAD12_12:%.*]] = load <4 x float>, ptr [[TMP37]], align 4
; SIFIVE-NEXT: [[TMP38:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_12]], <4 x float> [[WIDE_LOAD12_12]])
; SIFIVE-NEXT: store <4 x float> [[TMP38]], ptr [[TMP37]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_12:%.*]] = add nuw nsw i64 [[INDEX]], 52
; SIFIVE-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_12]]
; SIFIVE-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x float>, ptr [[TMP39]], align 4
; SIFIVE-NEXT: [[TMP40:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_12]]
; SIFIVE-NEXT: [[WIDE_LOAD12_13:%.*]] = load <4 x float>, ptr [[TMP40]], align 4
; SIFIVE-NEXT: [[TMP41:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_13]], <4 x float> [[WIDE_LOAD12_13]])
; SIFIVE-NEXT: store <4 x float> [[TMP41]], ptr [[TMP40]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_13:%.*]] = add nuw nsw i64 [[INDEX]], 56
; SIFIVE-NEXT: [[TMP42:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_13]]
; SIFIVE-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x float>, ptr [[TMP42]], align 4
; SIFIVE-NEXT: [[TMP43:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_13]]
; SIFIVE-NEXT: [[WIDE_LOAD12_14:%.*]] = load <4 x float>, ptr [[TMP43]], align 4
; SIFIVE-NEXT: [[TMP44:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_14]], <4 x float> [[WIDE_LOAD12_14]])
; SIFIVE-NEXT: store <4 x float> [[TMP44]], ptr [[TMP43]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_14:%.*]] = add nuw nsw i64 [[INDEX]], 60
; SIFIVE-NEXT: [[TMP45:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX_NEXT_14]]
; SIFIVE-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x float>, ptr [[TMP45]], align 4
; SIFIVE-NEXT: [[TMP46:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX_NEXT_14]]
; SIFIVE-NEXT: [[WIDE_LOAD12_15:%.*]] = load <4 x float>, ptr [[TMP46]], align 4
; SIFIVE-NEXT: [[TMP47:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD_15]], <4 x float> [[WIDE_LOAD12_15]])
; SIFIVE-NEXT: store <4 x float> [[TMP47]], ptr [[TMP46]], align 4
; SIFIVE-NEXT: [[INDEX_NEXT_15]] = add nuw nsw i64 [[INDEX]], 64
; SIFIVE-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT_15]], 1024
; SIFIVE-NEXT: br i1 [[TMP48]], label %[[EXIT:.*]], label %[[VECTOR_BODY]]
; SIFIVE: [[EXIT]]:
; SIFIVE-NEXT: ret void
;
entry:
%broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
%broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds nuw float, ptr %src, i64 %index
%wide.load = load <4 x float>, ptr %0, align 4
%1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
%wide.load12 = load <4 x float>, ptr %1, align 4
%2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
store <4 x float> %2, ptr %1, align 4
%index.next = add nuw i64 %index, 4
%3 = icmp eq i64 %index.next, 1024
br i1 %3, label %exit, label %vector.body
exit: ; preds = %vector.body
ret void
}
define void @saxpy_tripcount1K_av1(ptr %dst, ptr %src, float %a) {
; COMMON-LABEL: define void @saxpy_tripcount1K_av1(
; COMMON-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], float [[A:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[ENTRY:.*]]:
; COMMON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A]], i64 0
; COMMON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; COMMON-NEXT: br label %[[VECTOR_BODY:.*]]
; COMMON: [[VECTOR_BODY]]:
; COMMON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; COMMON-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[INDEX]]
; COMMON-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
; COMMON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
; COMMON-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
; COMMON-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[BROADCAST_SPLAT]], <4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD12]])
; COMMON-NEXT: store <4 x float> [[TMP2]], ptr [[TMP1]], align 4
; COMMON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; COMMON-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; COMMON-NEXT: br i1 [[TMP3]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; COMMON: [[EXIT]]:
; COMMON-NEXT: ret void
;
entry:
%broadcast.splatinsert = insertelement <4 x float> poison, float %a, i64 0
%broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %entry
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
%0 = getelementptr inbounds nuw float, ptr %src, i64 %index
%wide.load = load <4 x float>, ptr %0, align 4
%1 = getelementptr inbounds nuw float, ptr %dst, i64 %index
%wide.load12 = load <4 x float>, ptr %1, align 4
%2 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %broadcast.splat, <4 x float> %wide.load, <4 x float> %wide.load12)
store <4 x float> %2, ptr %1, align 4
%index.next = add nuw i64 %index, 4
%3 = icmp eq i64 %index.next, 1024
br i1 %3, label %exit, label %vector.body, !llvm.loop !0
exit: ; preds = %vector.body
ret void
}
; On SiFive we should runtime unroll the scalar epilogue loop, but not the
; vector loop.
define void @scalar_epilogue(ptr %p, i8 %splat.scalar, i64 %n) {
; CHECK-LABEL: define void @scalar_epilogue(
; CHECK-SAME: ptr [[P:%.*]], i8 [[SPLAT_SCALAR:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_REMAINDER_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -32
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[SPLAT_SCALAR]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[GEP_P_IV:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[IV]]
; CHECK-NEXT: [[GEP_P_IV_16:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_P_IV]], i64 16
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[GEP_P_IV]], align 1
; CHECK-NEXT: [[WIDE_LOAD_2:%.*]] = load <16 x i8>, ptr [[GEP_P_IV_16]], align 1
; CHECK-NEXT: [[ADD_BROADCAST:%.*]] = add <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[ADD_BROADCAST_2:%.*]] = add <16 x i8> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: store <16 x i8> [[ADD_BROADCAST]], ptr [[GEP_P_IV]], align 1
; CHECK-NEXT: store <16 x i8> [[ADD_BROADCAST_2]], ptr [[GEP_P_IV_16]], align 1
; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 32
; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_REMAINDER_PREHEADER]]
; CHECK: [[SCALAR_REMAINDER_PREHEADER]]:
; CHECK-NEXT: [[IV_SCALAR_LOOP_PH:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label %[[SCALAR_REMAINDER:.*]]
; CHECK: [[SCALAR_REMAINDER]]:
; CHECK-NEXT: [[IV_SCALAR_LOOP:%.*]] = phi i64 [ [[INC:%.*]], %[[SCALAR_REMAINDER]] ], [ [[IV_SCALAR_LOOP_PH]], %[[SCALAR_REMAINDER_PREHEADER]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[IV_SCALAR_LOOP]]
; CHECK-NEXT: [[SCALAR_LOAD:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT: [[ADD:%.*]] = add i8 [[SCALAR_LOAD]], [[SPLAT_SCALAR]]
; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
; CHECK-NEXT: [[INC]] = add nuw i64 [[IV_SCALAR_LOOP]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_REMAINDER]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: [[EXIT_LOOPEXIT]]:
; CHECK-NEXT: br label %[[EXIT]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: ret void
;
; SIFIVE-LABEL: define void @scalar_epilogue(
; SIFIVE-SAME: ptr [[P:%.*]], i8 [[SPLAT_SCALAR:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; SIFIVE-NEXT: [[ENTRY:.*]]:
; SIFIVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32
; SIFIVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_REMAINDER_PREHEADER:.*]], label %[[VECTOR_PH:.*]]
; SIFIVE: [[VECTOR_PH]]:
; SIFIVE-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -32
; SIFIVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[SPLAT_SCALAR]], i64 0
; SIFIVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
; SIFIVE-NEXT: br label %[[VECTOR_BODY:.*]]
; SIFIVE: [[VECTOR_BODY]]:
; SIFIVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], %[[VECTOR_BODY]] ]
; SIFIVE-NEXT: [[GEP_P_IV:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[IV]]
; SIFIVE-NEXT: [[GEP_P_IV_16:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_P_IV]], i64 16
; SIFIVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[GEP_P_IV]], align 1
; SIFIVE-NEXT: [[WIDE_LOAD_2:%.*]] = load <16 x i8>, ptr [[GEP_P_IV_16]], align 1
; SIFIVE-NEXT: [[ADD_BROADCAST:%.*]] = add <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; SIFIVE-NEXT: [[ADD_BROADCAST_2:%.*]] = add <16 x i8> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
; SIFIVE-NEXT: store <16 x i8> [[ADD_BROADCAST]], ptr [[GEP_P_IV]], align 1
; SIFIVE-NEXT: store <16 x i8> [[ADD_BROADCAST_2]], ptr [[GEP_P_IV_16]], align 1
; SIFIVE-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 32
; SIFIVE-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
; SIFIVE-NEXT: br i1 [[EXIT_COND]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; SIFIVE: [[MIDDLE_BLOCK]]:
; SIFIVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; SIFIVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_REMAINDER_PREHEADER]]
; SIFIVE: [[SCALAR_REMAINDER_PREHEADER]]:
; SIFIVE-NEXT: [[IV_SCALAR_LOOP:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ]
; SIFIVE-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[IV_SCALAR_LOOP]]
; SIFIVE-NEXT: [[TMP1:%.*]] = add i64 [[N]], -1
; SIFIVE-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], [[IV_SCALAR_LOOP]]
; SIFIVE-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP0]], 7
; SIFIVE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
; SIFIVE-NEXT: br i1 [[LCMP_MOD]], label %[[SCALAR_REMAINDER_PROL_PREHEADER:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT:.*]]
; SIFIVE: [[SCALAR_REMAINDER_PROL_PREHEADER]]:
; SIFIVE-NEXT: br label %[[SCALAR_REMAINDER_PROL:.*]]
; SIFIVE: [[SCALAR_REMAINDER_PROL]]:
; SIFIVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[IV_SCALAR_LOOP]]
; SIFIVE-NEXT: [[SCALAR_LOAD:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; SIFIVE-NEXT: [[ADD:%.*]] = add i8 [[SCALAR_LOAD]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1
; SIFIVE-NEXT: [[INC:%.*]] = add nuw i64 [[IV_SCALAR_LOOP]], 1
; SIFIVE-NEXT: [[PROL_ITER_CMP:%.*]] = icmp ne i64 1, [[XTRAITER]]
; SIFIVE-NEXT: br i1 [[PROL_ITER_CMP]], label %[[SCALAR_REMAINDER_PROL_1:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA:.*]]
; SIFIVE: [[SCALAR_REMAINDER_PROL_1]]:
; SIFIVE-NEXT: [[ARRAYIDX_PROL_1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC]]
; SIFIVE-NEXT: [[SCALAR_LOAD_PROL_1:%.*]] = load i8, ptr [[ARRAYIDX_PROL_1]], align 1
; SIFIVE-NEXT: [[ADD_PROL_1:%.*]] = add i8 [[SCALAR_LOAD_PROL_1]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_PROL_1]], ptr [[ARRAYIDX_PROL_1]], align 1
; SIFIVE-NEXT: [[INC_PROL_1:%.*]] = add nuw i64 [[IV_SCALAR_LOOP]], 2
; SIFIVE-NEXT: [[PROL_ITER_CMP_1:%.*]] = icmp ne i64 2, [[XTRAITER]]
; SIFIVE-NEXT: br i1 [[PROL_ITER_CMP_1]], label %[[SCALAR_REMAINDER_PROL_2:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]
; SIFIVE: [[SCALAR_REMAINDER_PROL_2]]:
; SIFIVE-NEXT: [[ARRAYIDX_PROL_2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_PROL_1]]
; SIFIVE-NEXT: [[SCALAR_LOAD_PROL_2:%.*]] = load i8, ptr [[ARRAYIDX_PROL_2]], align 1
; SIFIVE-NEXT: [[ADD_PROL_2:%.*]] = add i8 [[SCALAR_LOAD_PROL_2]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_PROL_2]], ptr [[ARRAYIDX_PROL_2]], align 1
; SIFIVE-NEXT: [[INC_PROL_2:%.*]] = add nuw i64 [[IV_SCALAR_LOOP]], 3
; SIFIVE-NEXT: [[PROL_ITER_CMP_2:%.*]] = icmp ne i64 3, [[XTRAITER]]
; SIFIVE-NEXT: br i1 [[PROL_ITER_CMP_2]], label %[[SCALAR_REMAINDER_PROL_3:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]
; SIFIVE: [[SCALAR_REMAINDER_PROL_3]]:
; SIFIVE-NEXT: [[ARRAYIDX_PROL_3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_PROL_2]]
; SIFIVE-NEXT: [[SCALAR_LOAD_PROL_3:%.*]] = load i8, ptr [[ARRAYIDX_PROL_3]], align 1
; SIFIVE-NEXT: [[ADD_PROL_3:%.*]] = add i8 [[SCALAR_LOAD_PROL_3]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_PROL_3]], ptr [[ARRAYIDX_PROL_3]], align 1
; SIFIVE-NEXT: [[INC_PROL_3:%.*]] = add nuw i64 [[IV_SCALAR_LOOP]], 4
; SIFIVE-NEXT: [[PROL_ITER_CMP_3:%.*]] = icmp ne i64 4, [[XTRAITER]]
; SIFIVE-NEXT: br i1 [[PROL_ITER_CMP_3]], label %[[SCALAR_REMAINDER_PROL_4:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]
; SIFIVE: [[SCALAR_REMAINDER_PROL_4]]:
; SIFIVE-NEXT: [[ARRAYIDX_PROL_4:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_PROL_3]]
; SIFIVE-NEXT: [[SCALAR_LOAD_PROL_4:%.*]] = load i8, ptr [[ARRAYIDX_PROL_4]], align 1
; SIFIVE-NEXT: [[ADD_PROL_4:%.*]] = add i8 [[SCALAR_LOAD_PROL_4]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_PROL_4]], ptr [[ARRAYIDX_PROL_4]], align 1
; SIFIVE-NEXT: [[INC_PROL_4:%.*]] = add nuw i64 [[IV_SCALAR_LOOP]], 5
; SIFIVE-NEXT: [[PROL_ITER_CMP_4:%.*]] = icmp ne i64 5, [[XTRAITER]]
; SIFIVE-NEXT: br i1 [[PROL_ITER_CMP_4]], label %[[SCALAR_REMAINDER_PROL_5:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]
; SIFIVE: [[SCALAR_REMAINDER_PROL_5]]:
; SIFIVE-NEXT: [[ARRAYIDX_PROL_5:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_PROL_4]]
; SIFIVE-NEXT: [[SCALAR_LOAD_PROL_5:%.*]] = load i8, ptr [[ARRAYIDX_PROL_5]], align 1
; SIFIVE-NEXT: [[ADD_PROL_5:%.*]] = add i8 [[SCALAR_LOAD_PROL_5]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_PROL_5]], ptr [[ARRAYIDX_PROL_5]], align 1
; SIFIVE-NEXT: [[INC_PROL_5:%.*]] = add nuw i64 [[IV_SCALAR_LOOP]], 6
; SIFIVE-NEXT: [[PROL_ITER_CMP_5:%.*]] = icmp ne i64 6, [[XTRAITER]]
; SIFIVE-NEXT: br i1 [[PROL_ITER_CMP_5]], label %[[SCALAR_REMAINDER_PROL_6:.*]], label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]
; SIFIVE: [[SCALAR_REMAINDER_PROL_6]]:
; SIFIVE-NEXT: [[ARRAYIDX_PROL_6:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_PROL_5]]
; SIFIVE-NEXT: [[SCALAR_LOAD_PROL_6:%.*]] = load i8, ptr [[ARRAYIDX_PROL_6]], align 1
; SIFIVE-NEXT: [[ADD_PROL_6:%.*]] = add i8 [[SCALAR_LOAD_PROL_6]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_PROL_6]], ptr [[ARRAYIDX_PROL_6]], align 1
; SIFIVE-NEXT: [[INC_PROL_6:%.*]] = add nuw i64 [[IV_SCALAR_LOOP]], 7
; SIFIVE-NEXT: br label %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]
; SIFIVE: [[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]]:
; SIFIVE-NEXT: [[IV_SCALAR_LOOP_UNR_PH:%.*]] = phi i64 [ [[INC]], %[[SCALAR_REMAINDER_PROL]] ], [ [[INC_PROL_1]], %[[SCALAR_REMAINDER_PROL_1]] ], [ [[INC_PROL_2]], %[[SCALAR_REMAINDER_PROL_2]] ], [ [[INC_PROL_3]], %[[SCALAR_REMAINDER_PROL_3]] ], [ [[INC_PROL_4]], %[[SCALAR_REMAINDER_PROL_4]] ], [ [[INC_PROL_5]], %[[SCALAR_REMAINDER_PROL_5]] ], [ [[INC_PROL_6]], %[[SCALAR_REMAINDER_PROL_6]] ]
; SIFIVE-NEXT: br label %[[SCALAR_REMAINDER_PROL_LOOPEXIT]]
; SIFIVE: [[SCALAR_REMAINDER_PROL_LOOPEXIT]]:
; SIFIVE-NEXT: [[IV_SCALAR_LOOP_UNR:%.*]] = phi i64 [ [[IV_SCALAR_LOOP]], %[[SCALAR_REMAINDER_PREHEADER]] ], [ [[IV_SCALAR_LOOP_UNR_PH]], %[[SCALAR_REMAINDER_PROL_LOOPEXIT_UNR_LCSSA]] ]
; SIFIVE-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP2]], 7
; SIFIVE-NEXT: br i1 [[TMP3]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_REMAINDER_PREHEADER_NEW:.*]]
; SIFIVE: [[SCALAR_REMAINDER_PREHEADER_NEW]]:
; SIFIVE-NEXT: br label %[[SCALAR_REMAINDER:.*]]
; SIFIVE: [[SCALAR_REMAINDER]]:
; SIFIVE-NEXT: [[IV_SCALAR_LOOP1:%.*]] = phi i64 [ [[IV_SCALAR_LOOP_UNR]], %[[SCALAR_REMAINDER_PREHEADER_NEW]] ], [ [[INC_7:%.*]], %[[SCALAR_REMAINDER]] ]
; SIFIVE-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[IV_SCALAR_LOOP1]]
; SIFIVE-NEXT: [[SCALAR_LOAD1:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
; SIFIVE-NEXT: [[ADD1:%.*]] = add i8 [[SCALAR_LOAD1]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD1]], ptr [[ARRAYIDX1]], align 1
; SIFIVE-NEXT: [[INC1:%.*]] = add nuw i64 [[IV_SCALAR_LOOP1]], 1
; SIFIVE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC1]]
; SIFIVE-NEXT: [[SCALAR_LOAD_1:%.*]] = load i8, ptr [[ARRAYIDX_1]], align 1
; SIFIVE-NEXT: [[ADD_1:%.*]] = add i8 [[SCALAR_LOAD_1]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_1]], ptr [[ARRAYIDX_1]], align 1
; SIFIVE-NEXT: [[INC_1:%.*]] = add nuw i64 [[IV_SCALAR_LOOP1]], 2
; SIFIVE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_1]]
; SIFIVE-NEXT: [[SCALAR_LOAD_2:%.*]] = load i8, ptr [[ARRAYIDX_2]], align 1
; SIFIVE-NEXT: [[ADD_2:%.*]] = add i8 [[SCALAR_LOAD_2]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_2]], ptr [[ARRAYIDX_2]], align 1
; SIFIVE-NEXT: [[INC_2:%.*]] = add nuw i64 [[IV_SCALAR_LOOP1]], 3
; SIFIVE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_2]]
; SIFIVE-NEXT: [[SCALAR_LOAD_3:%.*]] = load i8, ptr [[ARRAYIDX_3]], align 1
; SIFIVE-NEXT: [[ADD_3:%.*]] = add i8 [[SCALAR_LOAD_3]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_3]], ptr [[ARRAYIDX_3]], align 1
; SIFIVE-NEXT: [[INC_3:%.*]] = add nuw i64 [[IV_SCALAR_LOOP1]], 4
; SIFIVE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_3]]
; SIFIVE-NEXT: [[SCALAR_LOAD_4:%.*]] = load i8, ptr [[ARRAYIDX_4]], align 1
; SIFIVE-NEXT: [[ADD_4:%.*]] = add i8 [[SCALAR_LOAD_4]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_4]], ptr [[ARRAYIDX_4]], align 1
; SIFIVE-NEXT: [[INC_4:%.*]] = add nuw i64 [[IV_SCALAR_LOOP1]], 5
; SIFIVE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_4]]
; SIFIVE-NEXT: [[SCALAR_LOAD_5:%.*]] = load i8, ptr [[ARRAYIDX_5]], align 1
; SIFIVE-NEXT: [[ADD_5:%.*]] = add i8 [[SCALAR_LOAD_5]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_5]], ptr [[ARRAYIDX_5]], align 1
; SIFIVE-NEXT: [[INC_5:%.*]] = add nuw i64 [[IV_SCALAR_LOOP1]], 6
; SIFIVE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_5]]
; SIFIVE-NEXT: [[SCALAR_LOAD_6:%.*]] = load i8, ptr [[ARRAYIDX_6]], align 1
; SIFIVE-NEXT: [[ADD_6:%.*]] = add i8 [[SCALAR_LOAD_6]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_6]], ptr [[ARRAYIDX_6]], align 1
; SIFIVE-NEXT: [[INC_6:%.*]] = add nuw i64 [[IV_SCALAR_LOOP1]], 7
; SIFIVE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[INC_6]]
; SIFIVE-NEXT: [[SCALAR_LOAD_7:%.*]] = load i8, ptr [[ARRAYIDX_7]], align 1
; SIFIVE-NEXT: [[ADD_7:%.*]] = add i8 [[SCALAR_LOAD_7]], [[SPLAT_SCALAR]]
; SIFIVE-NEXT: store i8 [[ADD_7]], ptr [[ARRAYIDX_7]], align 1
; SIFIVE-NEXT: [[INC_7]] = add nuw i64 [[IV_SCALAR_LOOP1]], 8
; SIFIVE-NEXT: [[EXITCOND_NOT_7:%.*]] = icmp eq i64 [[INC_7]], [[N]]
; SIFIVE-NEXT: br i1 [[EXITCOND_NOT_7]], label %[[EXIT_LOOPEXIT_UNR_LCSSA:.*]], label %[[SCALAR_REMAINDER]], !llvm.loop [[LOOP3:![0-9]+]]
; SIFIVE: [[EXIT_LOOPEXIT_UNR_LCSSA]]:
; SIFIVE-NEXT: br label %[[EXIT_LOOPEXIT]]
; SIFIVE: [[EXIT_LOOPEXIT]]:
; SIFIVE-NEXT: br label %[[EXIT]]
; SIFIVE: [[EXIT]]:
; SIFIVE-NEXT: ret void
;
entry:
%min.iters.check = icmp ult i64 %n, 32
br i1 %min.iters.check, label %scalar.remainder, label %vector.ph
vector.ph:
%n.vec = and i64 %n, -32
%broadcast.splatinsert = insertelement <16 x i8> poison, i8 %splat.scalar, i64 0
%broadcast.splat = shufflevector <16 x i8> %broadcast.splatinsert, <16 x i8> poison, <16 x i32> zeroinitializer
br label %vector.body
vector.body:
%iv = phi i64 [ 0, %vector.ph ], [ %iv.next, %vector.body ]
%gep.p.iv = getelementptr inbounds nuw i8, ptr %p, i64 %iv
%gep.p.iv.16 = getelementptr inbounds nuw i8, ptr %gep.p.iv, i64 16
%wide.load = load <16 x i8>, ptr %gep.p.iv, align 1
%wide.load.2 = load <16 x i8>, ptr %gep.p.iv.16, align 1
%add.broadcast = add <16 x i8> %wide.load, %broadcast.splat
%add.broadcast.2 = add <16 x i8> %wide.load.2, %broadcast.splat
store <16 x i8> %add.broadcast, ptr %gep.p.iv, align 1
store <16 x i8> %add.broadcast.2, ptr %gep.p.iv.16, align 1
%iv.next = add nuw i64 %iv, 32
%exit.cond = icmp eq i64 %iv.next, %n.vec
br i1 %exit.cond, label %middle.block, label %vector.body, !llvm.loop !2
middle.block:
%cmp.n = icmp eq i64 %n, %n.vec
br i1 %cmp.n, label %exit, label %scalar.remainder
scalar.remainder:
%iv.scalar.loop = phi i64 [ %inc, %scalar.remainder ], [ %n.vec, %middle.block ], [ 0, %entry ]
%arrayidx = getelementptr inbounds nuw i8, ptr %p, i64 %iv.scalar.loop
%scalar.load = load i8, ptr %arrayidx, align 1
%add = add i8 %scalar.load, %splat.scalar
store i8 %add, ptr %arrayidx, align 1
%inc = add nuw i64 %iv.scalar.loop, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %exit, label %scalar.remainder, !llvm.loop !3
exit:
ret void
}
define void @vector_operands(ptr %p, i64 %n) {
; COMMON-LABEL: define void @vector_operands(
; COMMON-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; COMMON-NEXT: [[ENTRY:.*]]:
; COMMON-NEXT: br label %[[VECTOR_BODY:.*]]
; COMMON: [[VECTOR_BODY]]:
; COMMON-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_EVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; COMMON-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[ENTRY]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ]
; COMMON-NEXT: [[VL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true)
; COMMON-NEXT: [[ADDR:%.*]] = getelementptr i64, ptr [[P]], i64 [[EVL_BASED_IV]]
; COMMON-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> zeroinitializer, ptr align 8 [[ADDR]], <vscale x 2 x i1> splat (i1 true), i32 [[VL]])
; COMMON-NEXT: [[VL_ZEXT:%.*]] = zext i32 [[VL]] to i64
; COMMON-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[VL_ZEXT]], [[EVL_BASED_IV]]
; COMMON-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[VL_ZEXT]]
; COMMON-NEXT: [[TMP0:%.*]] = icmp eq i64 [[AVL_NEXT]], 0
; COMMON-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; COMMON: [[EXIT]]:
; COMMON-NEXT: ret void
;
entry:
br label %vector.body
vector.body:
%evl.based.iv = phi i64 [ 0, %entry ], [ %index.evl.next, %vector.body ]
%avl = phi i64 [ %n, %entry ], [ %avl.next, %vector.body ]
%vl = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl, i32 2, i1 true)
%addr = getelementptr i64, ptr %p, i64 %evl.based.iv
call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> splat (i64 0), ptr align 8 %addr, <vscale x 2 x i1> splat (i1 true), i32 %vl)
%vl.zext = zext i32 %vl to i64
%index.evl.next = add nuw i64 %vl.zext, %evl.based.iv
%avl.next = sub nuw i64 %avl, %vl.zext
%3 = icmp eq i64 %avl.next, 0
br i1 %3, label %exit, label %vector.body, !llvm.loop !2
exit:
ret void
}
!0 = !{!0, !1}
!1 = !{!"llvm.loop.isvectorized", i32 1}
!2 = distinct !{!2, !1}
!3 = distinct !{!3, !1}
;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
;.
; SIFIVE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
; SIFIVE: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; SIFIVE: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
; SIFIVE: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
;.