Florian Hahn 2dcf858ba0
[LAA] Use SCEVPtrToAddr in tryToCreateDiffChecks. (#178861)
The checks created by LAA only compute a pointer difference and do not
need to capture provenance. Use SCEVPtrToAddr instead of SCEVPtrToInt
for computations.

To avoid regressions while parts of SCEV are migrated to use PtrToAddr
this adds logic to rewrite all PtrToInt to PtrToAddr if possible in the
created expressions. This is needed to avoid regressions.

Similarly, if in the original IR we have a PtrToInt, SCEVExpander tries
to re-use it if possible when expanding PtrToAddr.

Depends on https://github.com/llvm/llvm-project/pull/178727.

Fixes https://github.com/llvm/llvm-project/issues/156978.

PR: https://github.com/llvm/llvm-project/pull/178861
2026-02-11 11:51:51 +00:00

270 lines
17 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6
; Test VLA for reverse with fixed size vector
; This is the loop in c++ being vectorize in this file with
; shuffle reverse
; #pragma clang loop vectorize_width(8, fixed)
; for (int i = N-1; i >= 0; --i)
; a[i] = b[i] + 1.0;
; RUN: opt -passes=loop-vectorize,dce -mtriple aarch64-linux-gnu -S \
; RUN: -prefer-predicate-over-epilogue=scalar-epilogue < %s | FileCheck %s
define void @vector_reverse_f64(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-LABEL: define void @vector_reverse_f64(
; CHECK-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[A2:%.*]] = ptrtoaddr ptr [[A]] to i64
; CHECK-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64
; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N]], 0
; CHECK-NEXT: br i1 [[CMP7]], label %[[FOR_BODY_PREHEADER:.*]], [[FOR_COND_CLEANUP:label %.*]]
; CHECK: [[FOR_BODY_PREHEADER]]:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[B1]], [[A2]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 64
; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[N]], [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP3]], i64 0
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[TMP4]], i64 -7
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x double>, ptr [[TMP5]], align 8
; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <8 x double> [[WIDE_LOAD]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[TMP6:%.*]] = fadd <8 x double> [[REVERSE]], splat (double 1.000000e+00)
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[TMP7]], i64 0
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, ptr [[TMP8]], i64 -7
; CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: store <8 x double> [[REVERSE3]], ptr [[TMP9]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], [[FOR_COND_CLEANUP_LOOPEXIT:label %.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
;
entry:
%cmp7 = icmp sgt i64 %N, 0
br i1 %cmp7, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond.cleanup, %entry
ret void
for.body: ; preds = %entry, %for.body
%i.08.in = phi i64 [ %i.08, %for.body ], [ %N, %entry ]
%i.08 = add nsw i64 %i.08.in, -1
%arrayidx = getelementptr inbounds double, ptr %b, i64 %i.08
%0 = load double, ptr %arrayidx, align 8
%add = fadd double %0, 1.000000e+00
%arrayidx1 = getelementptr inbounds double, ptr %a, i64 %i.08
store double %add, ptr %arrayidx1, align 8
%cmp = icmp sgt i64 %i.08.in, 1
br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
}
define void @vector_reverse_i64(i64 %N, ptr %a, ptr %b) #0 {
; CHECK-LABEL: define void @vector_reverse_i64(
; CHECK-SAME: i64 [[N:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[A2:%.*]] = ptrtoaddr ptr [[A]] to i64
; CHECK-NEXT: [[B1:%.*]] = ptrtoaddr ptr [[B]] to i64
; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i64 [[N]], 0
; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_BODY_PREHEADER:.*]], [[FOR_COND_CLEANUP:label %.*]]
; CHECK: [[FOR_BODY_PREHEADER]]:
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
; CHECK: [[VECTOR_MEMCHECK]]:
; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[B1]], [[A2]]
; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 64
; CHECK-NEXT: br i1 [[DIFF_CHECK]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[N]], [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = add nsw i64 [[OFFSET_IDX]], -1
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i64 0
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i64 -7
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i64> [[REVERSE]], splat (i64 1)
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i64 0
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i64 -7
; CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: store <8 x i64> [[REVERSE3]], ptr [[TMP9]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], [[FOR_COND_CLEANUP_LOOPEXIT:label %.*]], label %[[SCALAR_PH]]
; CHECK: [[SCALAR_PH]]:
;
entry:
%cmp8 = icmp sgt i64 %N, 0
br i1 %cmp8, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond.cleanup, %entry
ret void
for.body: ; preds = %entry, %for.body
%i.09.in = phi i64 [ %i.09, %for.body ], [ %N, %entry ]
%i.09 = add nsw i64 %i.09.in, -1
%arrayidx = getelementptr inbounds i64, ptr %b, i64 %i.09
%0 = load i64, ptr %arrayidx, align 8
%add = add i64 %0, 1
%arrayidx2 = getelementptr inbounds i64, ptr %a, i64 %i.09
store i64 %add, ptr %arrayidx2, align 8
%cmp = icmp sgt i64 %i.09.in, 1
br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
}
define i32 @reverse_store_with_partial_reduction(ptr noalias %dst, ptr noalias %src, i64 %n) #1 {
; CHECK-LABEL: define i32 @reverse_store_with_partial_reduction(
; CHECK-SAME: ptr noalias [[DST:%.*]], ptr noalias [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[ITER_CHECK:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 5
; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 3
; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP5]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE6:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[N]], [[INDEX]]
; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[SRC]], align 2
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP7]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = sext <vscale x 8 x i16> [[BROADCAST_SPLAT]] to <vscale x 8 x i32>
; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP8]])
; CHECK-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> [[VEC_PHI2]], <vscale x 8 x i32> [[TMP8]])
; CHECK-NEXT: [[PARTIAL_REDUCE6]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> [[VEC_PHI3]], <vscale x 8 x i32> [[TMP8]])
; CHECK-NEXT: [[PARTIAL_REDUCE7]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> [[VEC_PHI4]], <vscale x 8 x i32> [[TMP8]])
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP4]]
; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[TMP4]], 1
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 -1, [[TMP11]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i16, ptr [[TMP9]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[TMP13]], i64 [[TMP12]]
; CHECK-NEXT: [[TMP15:%.*]] = mul i64 -1, [[TMP4]]
; CHECK-NEXT: [[TMP16:%.*]] = sub i64 [[TMP4]], 1
; CHECK-NEXT: [[TMP17:%.*]] = mul i64 -1, [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[TMP9]], i64 [[TMP15]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i16, ptr [[TMP18]], i64 [[TMP17]]
; CHECK-NEXT: [[TMP20:%.*]] = mul i64 -2, [[TMP4]]
; CHECK-NEXT: [[TMP21:%.*]] = sub i64 [[TMP4]], 1
; CHECK-NEXT: [[TMP22:%.*]] = mul i64 -1, [[TMP21]]
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i16, ptr [[TMP9]], i64 [[TMP20]]
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i16, ptr [[TMP23]], i64 [[TMP22]]
; CHECK-NEXT: [[TMP25:%.*]] = mul i64 -3, [[TMP4]]
; CHECK-NEXT: [[TMP26:%.*]] = sub i64 [[TMP4]], 1
; CHECK-NEXT: [[TMP27:%.*]] = mul i64 -1, [[TMP26]]
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i16, ptr [[TMP9]], i64 [[TMP25]]
; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i16, ptr [[TMP28]], i64 [[TMP27]]
; CHECK-NEXT: [[REVERSE:%.*]] = call <vscale x 8 x i16> @llvm.vector.reverse.nxv8i16(<vscale x 8 x i16> [[BROADCAST_SPLAT]])
; CHECK-NEXT: store <vscale x 8 x i16> [[REVERSE]], ptr [[TMP14]], align 2
; CHECK-NEXT: store <vscale x 8 x i16> [[REVERSE]], ptr [[TMP19]], align 2
; CHECK-NEXT: store <vscale x 8 x i16> [[REVERSE]], ptr [[TMP24]], align 2
; CHECK-NEXT: store <vscale x 8 x i16> [[REVERSE]], ptr [[TMP29]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP30]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE6]], [[BIN_RDX]]
; CHECK-NEXT: [[BIN_RDX9:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE7]], [[BIN_RDX8]]
; CHECK-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX9]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
; CHECK-NEXT: [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF8:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP31]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[N_MOD_VF10:%.*]] = urem i64 [[TMP0]], 4
; CHECK-NEXT: [[N_VEC11:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF10]]
; CHECK-NEXT: [[TMP32:%.*]] = sub i64 [[N]], [[N_VEC11]]
; CHECK-NEXT: [[TMP33:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT18:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI13:%.*]] = phi <2 x i32> [ [[TMP33]], %[[VEC_EPILOG_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[N]], [[INDEX12]]
; CHECK-NEXT: [[TMP34:%.*]] = load i16, ptr [[SRC]], align 2
; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x i16> poison, i16 [[TMP34]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT14]], <4 x i16> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP35:%.*]] = sext <4 x i16> [[BROADCAST_SPLAT15]] to <4 x i32>
; CHECK-NEXT: [[PARTIAL_REDUCE16]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v4i32(<2 x i32> [[VEC_PHI13]], <4 x i32> [[TMP35]])
; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i16, ptr [[DST]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i16, ptr [[TMP36]], i64 0
; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i16, ptr [[TMP37]], i64 -3
; CHECK-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLAT15]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: store <4 x i16> [[REVERSE17]], ptr [[TMP38]], align 2
; CHECK-NEXT: [[INDEX_NEXT18]] = add nuw i64 [[INDEX12]], 4
; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i64 [[INDEX_NEXT18]], [[N_VEC11]]
; CHECK-NEXT: br i1 [[TMP39]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
; CHECK-NEXT: [[TMP40:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PARTIAL_REDUCE16]])
; CHECK-NEXT: [[CMP_N19:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC11]]
; CHECK-NEXT: br i1 [[CMP_N19]], [[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
;
entry:
br label %loop
loop:
%iv = phi i64 [ %n, %entry ], [ %iv.next, %loop ]
%sum = phi i32 [ 0, %entry ], [ %sum.next, %loop ]
%uniform.load = load i16, ptr %src, align 2
%ext = sext i16 %uniform.load to i32
%sum.next = add i32 %sum, %ext
%dst.gep = getelementptr i16, ptr %dst, i64 %iv
store i16 %uniform.load, ptr %dst.gep, align 2
%iv.next = add i64 %iv, -1
%cmp = icmp ugt i64 %iv, 0
br i1 %cmp, label %loop, label %exit
exit:
ret i32 %sum.next
}
attributes #0 = { "target-cpu"="generic" "target-features"="+neon,+sve" }
attributes #1 = { "target-cpu"="neoverse-v2" }
!0 = distinct !{!0, !1, !2, !3, !4}
!1 = !{!"llvm.loop.mustprogress"}
!2 = !{!"llvm.loop.vectorize.width", i32 8}
!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 false}
!4 = !{!"llvm.loop.vectorize.enable", i1 true}