Sander de Smalen 3157758190
[LV] Handle partial sub-reductions with sub in middle block. (#178919)
Sub-reductions can be implemented in two ways:
(1) negate the operand in the vector loop (the default way).
(2) subtract the reduced value from the init value in the middle block.

Note that both ways keep the reduction itself as an 'add' reduction,
which is necessary because only llvm.vector.partial.reduce.add exists.

The ISD nodes for partial reductions don't support folding the
sub/negation into its operands because the following is not a valid
transformation:
```
     sub(0, mul(ext(a), ext(b)))
  -> mul(ext(a), ext(sub(0, b)))
```
It can therefore be better to choose option (2) such that the partial
reduction is always positive (starting at '0') and to do a final
subtract in the middle block.

For AArch64 there are no dot-product instructions that can
do a `partial.reduce.sub(acc, mul(ext(a), ext(b)))` operation.
I'm not sure if such instructions exist for other targets.
(If so then we may want to make this decision a target option)

This PR also increases the AArch64 cost of a partial sub-reduction
when this exists in an 'add-sub' reduction chain.

Fixes https://github.com/llvm/llvm-project/issues/178703
2026-02-10 11:00:32 +00:00

1393 lines
100 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 4
; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-NEON
; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+sve -S < %s | FileCheck %s --check-prefixes=CHECK-SVE
; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+sve \
; RUN: -vectorizer-maximize-bandwidth -force-vector-width=8 -S < %s | FileCheck %s --check-prefixes=CHECK-SVE-MAXBW
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-none-unknown-elf"
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read) vscale_range(1,16)
define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_add_sub(
; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEON-NEXT: entry:
; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEON: vector.ph:
; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NEON: vector.body:
; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
; CHECK-NEON-NEXT: [[TMP13:%.*]] = sub <16 x i32> zeroinitializer, [[TMP12]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-NEON: middle.block:
; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-NEON: scalar.ph:
;
; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_add_sub(
; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-SVE-NEXT: entry:
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 4
; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 16 x i32> [[TMP13]], [[TMP15]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP16]])
; CHECK-SVE-NEXT: [[TMP10:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP11:%.*]] = mul nsw <vscale x 16 x i32> [[TMP13]], [[TMP10]]
; CHECK-SVE-NEXT: [[TMP12:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP11]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]], <vscale x 16 x i32> [[TMP12]])
; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-SVE: middle.block:
; CHECK-SVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
;
; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_add_sub(
; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-SVE-MAXBW-NEXT: entry:
; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE-MAXBW: vector.ph:
; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 3
; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE-MAXBW: vector.body:
; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP16]])
; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP11]]
; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = sub <vscale x 8 x i32> zeroinitializer, [[TMP17]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP18]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-SVE-MAXBW: middle.block:
; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE-MAXBW: scalar.ph:
;
entry:
%cmp28.not = icmp ult i32 %N, 2
%div27 = lshr i32 %N, 1
%wide.trip.count = zext nneg i32 %div27 to i64
br label %for.body
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
%res.0.lcssa = phi i32 [ %sub, %for.body ]
ret i32 %res.0.lcssa
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%res = phi i32 [ 0, %entry ], [ %sub, %for.body ]
%a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
%b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
%c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
%a.val = load i8, ptr %a.ptr, align 1
%b.val = load i8, ptr %b.ptr, align 1
%c.val = load i8, ptr %c.ptr, align 1
%a.ext = sext i8 %a.val to i32
%b.ext = sext i8 %b.val to i32
%c.ext = sext i8 %c.val to i32
%mul.ab = mul nsw i32 %a.ext, %b.ext
%add = add nsw i32 %res, %mul.ab
%mul.ac = mul nsw i32 %a.ext, %c.ext
%sub = sub i32 %add, %mul.ac
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_add_add(
; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEON-NEXT: entry:
; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEON: vector.ph:
; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NEON: vector.body:
; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP9]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-NEON: middle.block:
; CHECK-NEON-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-NEON: scalar.ph:
;
; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_add_add(
; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-NEXT: entry:
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 4
; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-NEXT: [[TMP12:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP15:%.*]] = mul nsw <vscale x 16 x i32> [[TMP12]], [[TMP14]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP15]])
; CHECK-SVE-NEXT: [[TMP10:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP11:%.*]] = mul nsw <vscale x 16 x i32> [[TMP12]], [[TMP10]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]], <vscale x 16 x i32> [[TMP11]])
; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-SVE: middle.block:
; CHECK-SVE-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
;
; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_add_add(
; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-MAXBW-NEXT: entry:
; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE-MAXBW: vector.ph:
; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 3
; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE-MAXBW: vector.body:
; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP16]])
; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP11]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP17]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-SVE-MAXBW: middle.block:
; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE-MAXBW: scalar.ph:
;
entry:
%cmp28.not = icmp ult i32 %N, 2
%div27 = lshr i32 %N, 1
%wide.trip.count = zext nneg i32 %div27 to i64
br label %for.body
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
%res.0.lcssa = phi i32 [ %add.2, %for.body ]
ret i32 %res.0.lcssa
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%res = phi i32 [ 0, %entry ], [ %add.2, %for.body ]
%a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
%b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
%c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
%a.val = load i8, ptr %a.ptr, align 1
%b.val = load i8, ptr %b.ptr, align 1
%c.val = load i8, ptr %c.ptr, align 1
%a.ext = sext i8 %a.val to i32
%b.ext = sext i8 %b.val to i32
%c.ext = sext i8 %c.val to i32
%mul.ab = mul nsw i32 %a.ext, %b.ext
%add = add nsw i32 %res, %mul.ab
%mul.ac = mul nsw i32 %a.ext, %c.ext
%add.2 = add i32 %add, %mul.ac
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_sub_add(
; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEON-NEXT: entry:
; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEON: vector.ph:
; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NEON: vector.body:
; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
; CHECK-NEON-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP13]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-NEON: middle.block:
; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-NEON: scalar.ph:
;
; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_sub_add(
; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-NEXT: entry:
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 4
; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 16 x i32> [[TMP13]], [[TMP15]]
; CHECK-SVE-NEXT: [[TMP10:%.*]] = sub nsw <vscale x 16 x i32> zeroinitializer, [[TMP16]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP10]])
; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw <vscale x 16 x i32> [[TMP13]], [[TMP11]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]], <vscale x 16 x i32> [[TMP12]])
; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-SVE: middle.block:
; CHECK-SVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
;
; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_sub_add(
; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-MAXBW-NEXT: entry:
; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE-MAXBW: vector.ph:
; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 3
; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE-MAXBW: vector.body:
; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw <vscale x 8 x i32> zeroinitializer, [[TMP16]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP17]])
; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP12]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP18]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-SVE-MAXBW: middle.block:
; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE-MAXBW: scalar.ph:
;
entry:
%cmp28.not = icmp ult i32 %N, 2
%div27 = lshr i32 %N, 1
%wide.trip.count = zext nneg i32 %div27 to i64
br label %for.body
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
%res.0.lcssa = phi i32 [ %add, %for.body ]
ret i32 %res.0.lcssa
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%res = phi i32 [ 0, %entry ], [ %add, %for.body ]
%a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
%b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
%c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
%a.val = load i8, ptr %a.ptr, align 1
%b.val = load i8, ptr %b.ptr, align 1
%c.val = load i8, ptr %c.ptr, align 1
%a.ext = sext i8 %a.val to i32
%b.ext = sext i8 %b.val to i32
%c.ext = sext i8 %c.val to i32
%mul.ab = mul nsw i32 %a.ext, %b.ext
%sub = sub nsw i32 %res, %mul.ab
%mul.ac = mul nsw i32 %a.ext, %c.ext
%add = add i32 %sub, %mul.ac
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_sub_sub(
; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEON-NEXT: entry:
; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEON: vector.ph:
; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NEON: vector.body:
; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
; CHECK-NEON-NEXT: [[TMP16:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP16]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-NEON: middle.block:
; CHECK-NEON-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub i32 0, [[TMP15]]
; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-NEON: scalar.ph:
;
; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_sub_sub(
; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-NEXT: entry:
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 4
; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP16:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 16 x i32> [[TMP14]], [[TMP16]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP17]])
; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw <vscale x 16 x i32> [[TMP14]], [[TMP11]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]], <vscale x 16 x i32> [[TMP12]])
; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-SVE: middle.block:
; CHECK-SVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-NEXT: [[TMP18:%.*]] = sub i32 0, [[TMP15]]
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
;
; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_sub_sub(
; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-MAXBW-NEXT: entry:
; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE-MAXBW: vector.ph:
; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 3
; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE-MAXBW: vector.body:
; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP16]])
; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP12]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP18]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK-SVE-MAXBW: middle.block:
; CHECK-SVE-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP21]]
; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE-MAXBW: scalar.ph:
;
entry:
%cmp28.not = icmp ult i32 %N, 2
%div27 = lshr i32 %N, 1
%wide.trip.count = zext nneg i32 %div27 to i64
br label %for.body
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
%res.0.lcssa = phi i32 [ %sub.2, %for.body ]
ret i32 %res.0.lcssa
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%res = phi i32 [ 0, %entry ], [ %sub.2, %for.body ]
%a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
%b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
%c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
%a.val = load i8, ptr %a.ptr, align 1
%b.val = load i8, ptr %b.ptr, align 1
%c.val = load i8, ptr %c.ptr, align 1
%a.ext = sext i8 %a.val to i32
%b.ext = sext i8 %b.val to i32
%c.ext = sext i8 %c.val to i32
%mul.ab = mul nsw i32 %a.ext, %b.ext
%sub = sub nsw i32 %res, %mul.ab
%mul.ac = mul nsw i32 %a.ext, %c.ext
%sub.2 = sub i32 %sub, %mul.ac
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_add_add_add(
; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEON-NEXT: entry:
; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEON: vector.ph:
; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NEON: vector.body:
; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
; CHECK-NEON-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP15]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]])
; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP15]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP12]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-NEON: middle.block:
; CHECK-NEON-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE4]])
; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-NEON: scalar.ph:
;
; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_add_add_add(
; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-NEXT: entry:
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 4
; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 16 x i32> [[TMP13]], [[TMP15]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP16]])
; CHECK-SVE-NEXT: [[TMP10:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP11:%.*]] = mul nsw <vscale x 16 x i32> [[TMP13]], [[TMP10]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]], <vscale x 16 x i32> [[TMP11]])
; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw <vscale x 16 x i32> [[TMP15]], [[TMP10]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE4]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE3]], <vscale x 16 x i32> [[TMP12]])
; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-SVE: middle.block:
; CHECK-SVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE4]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
;
; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_add_add_add(
; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-MAXBW-NEXT: entry:
; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE-MAXBW: vector.ph:
; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 3
; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE-MAXBW: vector.body:
; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP16]])
; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP11]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP17]])
; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP14]], [[TMP11]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE4]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE3]], <vscale x 8 x i32> [[TMP18]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-SVE-MAXBW: middle.block:
; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE4]])
; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE-MAXBW: scalar.ph:
;
entry:
%cmp28.not = icmp ult i32 %N, 2
%div27 = lshr i32 %N, 1
%wide.trip.count = zext nneg i32 %div27 to i64
br label %for.body
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
%res.0.lcssa = phi i32 [ %sub.2, %for.body ]
ret i32 %res.0.lcssa
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%res = phi i32 [ 0, %entry ], [ %sub.2, %for.body ]
%a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
%b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
%c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
%a.val = load i8, ptr %a.ptr, align 1
%b.val = load i8, ptr %b.ptr, align 1
%c.val = load i8, ptr %c.ptr, align 1
%a.ext = sext i8 %a.val to i32
%b.ext = sext i8 %b.val to i32
%c.ext = sext i8 %c.val to i32
%mul.ab = mul nsw i32 %a.ext, %b.ext
%sub = add nsw i32 %res, %mul.ab
%mul.ac = mul nsw i32 %a.ext, %c.ext
%add = add nsw i32 %sub, %mul.ac
%mul.bc = mul nsw i32 %b.ext, %c.ext
%sub.2 = add i32 %add, %mul.bc
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_sub_add_sub(
; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEON-NEXT: entry:
; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEON: vector.ph:
; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NEON: vector.body:
; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP10]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP11]])
; CHECK-NEON-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP13]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]])
; CHECK-NEON-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP8]], [[TMP13]]
; CHECK-NEON-NEXT: [[TMP15:%.*]] = sub <16 x i32> zeroinitializer, [[TMP14]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP15]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-NEON: middle.block:
; CHECK-NEON-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE4]])
; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-NEON: scalar.ph:
;
; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_sub_add_sub(
; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-NEXT: entry:
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 4
; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP17:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 16 x i32> [[TMP15]], [[TMP17]]
; CHECK-SVE-NEXT: [[TMP10:%.*]] = sub nsw <vscale x 16 x i32> zeroinitializer, [[TMP18]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP10]])
; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw <vscale x 16 x i32> [[TMP15]], [[TMP11]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]], <vscale x 16 x i32> [[TMP12]])
; CHECK-SVE-NEXT: [[TMP13:%.*]] = mul nsw <vscale x 16 x i32> [[TMP17]], [[TMP11]]
; CHECK-SVE-NEXT: [[TMP14:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP13]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE4]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE3]], <vscale x 16 x i32> [[TMP14]])
; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-SVE: middle.block:
; CHECK-SVE-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE4]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
;
; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_sub_add_sub(
; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-MAXBW-NEXT: entry:
; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE-MAXBW: vector.ph:
; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 3
; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE-MAXBW: vector.body:
; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[TMP17:%.*]] = sub nsw <vscale x 8 x i32> zeroinitializer, [[TMP16]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP17]])
; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP12]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP18]])
; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = mul nsw <vscale x 8 x i32> [[TMP14]], [[TMP12]]
; CHECK-SVE-MAXBW-NEXT: [[TMP20:%.*]] = sub <vscale x 8 x i32> zeroinitializer, [[TMP19]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE4]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE3]], <vscale x 8 x i32> [[TMP20]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK-SVE-MAXBW: middle.block:
; CHECK-SVE-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE4]])
; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE-MAXBW: scalar.ph:
;
entry:
%cmp28.not = icmp ult i32 %N, 2
%div27 = lshr i32 %N, 1
%wide.trip.count = zext nneg i32 %div27 to i64
br label %for.body
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
%res.0.lcssa = phi i32 [ %sub.2, %for.body ]
ret i32 %res.0.lcssa
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%res = phi i32 [ 0, %entry ], [ %sub.2, %for.body ]
%a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
%b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
%c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
%a.val = load i8, ptr %a.ptr, align 1
%b.val = load i8, ptr %b.ptr, align 1
%c.val = load i8, ptr %c.ptr, align 1
%a.ext = sext i8 %a.val to i32
%b.ext = sext i8 %b.val to i32
%c.ext = sext i8 %c.val to i32
%mul.ab = mul nsw i32 %a.ext, %b.ext
%sub = sub nsw i32 %res, %mul.ab
%mul.ac = mul nsw i32 %a.ext, %c.ext
%add = add nsw i32 %sub, %mul.ac
%mul.bc = mul nsw i32 %b.ext, %c.ext
%sub.2 = sub i32 %add, %mul.bc
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
define i32 @chained_partial_reduce_madd_extadd(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_madd_extadd(
; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEON-NEXT: entry:
; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEON: vector.ph:
; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NEON: vector.body:
; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP10]])
; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP9]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-NEON: middle.block:
; CHECK-NEON-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-NEON: scalar.ph:
;
; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_madd_extadd(
; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-NEXT: entry:
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 4
; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP14:%.*]] = mul nsw <vscale x 16 x i32> [[TMP11]], [[TMP13]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP14]])
; CHECK-SVE-NEXT: [[TMP10:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]], <vscale x 16 x i32> [[TMP10]])
; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-SVE: middle.block:
; CHECK-SVE-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
;
; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_madd_extadd(
; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-MAXBW-NEXT: entry:
; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE-MAXBW: vector.ph:
; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 3
; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE-MAXBW: vector.body:
; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP16]])
; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP15]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-SVE-MAXBW: middle.block:
; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE-MAXBW: scalar.ph:
;
entry:
%cmp28.not = icmp ult i32 %N, 2
%div27 = lshr i32 %N, 1
%wide.trip.count = zext nneg i32 %div27 to i64
br label %for.body
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
%res.0.lcssa = phi i32 [ %add2, %for.body ]
ret i32 %res.0.lcssa
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%res = phi i32 [ 0, %entry ], [ %add2, %for.body ]
%a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
%b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
%c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
%a.val = load i8, ptr %a.ptr, align 1
%b.val = load i8, ptr %b.ptr, align 1
%c.val = load i8, ptr %c.ptr, align 1
%a.ext = sext i8 %a.val to i32
%b.ext = sext i8 %b.val to i32
%c.ext = sext i8 %c.val to i32
%mul.ab = mul nsw i32 %a.ext, %b.ext
%add = add nsw i32 %res, %mul.ab
%add2 = add i32 %add, %c.ext
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
define i32 @chained_partial_reduce_extadd_extadd(ptr %a, ptr %b, i32 %N) #0 {
; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_extadd_extadd(
; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEON-NEXT: entry:
; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEON: vector.ph:
; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NEON: vector.body:
; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-NEON-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]])
; CHECK-NEON-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP6]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK-NEON: middle.block:
; CHECK-NEON-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE2]])
; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-NEON: scalar.ph:
;
; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_extadd_extadd(
; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-NEXT: entry:
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 4
; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-NEXT: [[TMP6:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP6]])
; CHECK-SVE-NEXT: [[TMP10:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE2]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]], <vscale x 16 x i32> [[TMP10]])
; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK-SVE: middle.block:
; CHECK-SVE-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE2]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
;
; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_extadd_extadd(
; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-MAXBW-NEXT: entry:
; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE-MAXBW: vector.ph:
; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 3
; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE-MAXBW: vector.body:
; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP11]])
; CHECK-SVE-MAXBW-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE2]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP12]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK-SVE-MAXBW: middle.block:
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE2]])
; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE-MAXBW: scalar.ph:
;
entry:
%cmp28.not = icmp ult i32 %N, 2
%div27 = lshr i32 %N, 1
%wide.trip.count = zext nneg i32 %div27 to i64
br label %for.body
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
%res.0.lcssa = phi i32 [ %add2, %for.body ]
ret i32 %res.0.lcssa
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%res = phi i32 [ 0, %entry ], [ %add2, %for.body ]
%a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
%b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
%a.val = load i8, ptr %a.ptr, align 1
%b.val = load i8, ptr %b.ptr, align 1
%a.ext = sext i8 %a.val to i32
%b.ext = sext i8 %b.val to i32
%add = add nsw i32 %res, %a.ext
%add2 = add i32 %add, %b.ext
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
define i32 @chained_partial_reduce_extadd_madd(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
; CHECK-NEON-LABEL: define i32 @chained_partial_reduce_extadd_madd(
; CHECK-NEON-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEON-NEXT: entry:
; CHECK-NEON-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-NEON-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-NEON-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-NEON: vector.ph:
; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16
; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-NEON: vector.body:
; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEON-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-NEON-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-NEON-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-NEON-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]])
; CHECK-NEON-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP8:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
; CHECK-NEON-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP7]], [[TMP8]]
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP10]])
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEON-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK-NEON: middle.block:
; CHECK-NEON-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-NEON: scalar.ph:
;
; CHECK-SVE-LABEL: define i32 @chained_partial_reduce_extadd_madd(
; CHECK-SVE-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-NEXT: entry:
; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE: vector.ph:
; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 4
; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE: vector.body:
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP11]])
; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
; CHECK-SVE-NEXT: [[TMP10:%.*]] = mul nsw <vscale x 16 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]], <vscale x 16 x i32> [[TMP10]])
; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK-SVE: middle.block:
; CHECK-SVE-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE: scalar.ph:
;
; CHECK-SVE-MAXBW-LABEL: define i32 @chained_partial_reduce_extadd_madd(
; CHECK-SVE-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; CHECK-SVE-MAXBW-NEXT: entry:
; CHECK-SVE-MAXBW-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
; CHECK-SVE-MAXBW-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-SVE-MAXBW: vector.ph:
; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 3
; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SVE-MAXBW: vector.body:
; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]]
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP7]], align 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP8]], align 1
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP9]], align 1
; CHECK-SVE-MAXBW-NEXT: [[TMP15:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[VEC_PHI]], <vscale x 8 x i32> [[TMP15]])
; CHECK-SVE-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP14:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32>
; CHECK-SVE-MAXBW-NEXT: [[TMP16:%.*]] = mul nsw <vscale x 8 x i32> [[TMP13]], [[TMP14]]
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i32> @llvm.vector.partial.reduce.add.nxv2i32.nxv8i32(<vscale x 2 x i32> [[PARTIAL_REDUCE]], <vscale x 8 x i32> [[TMP16]])
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; CHECK-SVE-MAXBW-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK-SVE-MAXBW: middle.block:
; CHECK-SVE-MAXBW-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[PARTIAL_REDUCE3]])
; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-SVE-MAXBW: scalar.ph:
;
entry:
%cmp28.not = icmp ult i32 %N, 2
%div27 = lshr i32 %N, 1
%wide.trip.count = zext nneg i32 %div27 to i64
br label %for.body
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
%res.0.lcssa = phi i32 [ %add2, %for.body ]
ret i32 %res.0.lcssa
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%res = phi i32 [ 0, %entry ], [ %add2, %for.body ]
%a.ptr = getelementptr inbounds nuw i8, ptr %a, i64 %indvars.iv
%b.ptr = getelementptr inbounds nuw i8, ptr %b, i64 %indvars.iv
%c.ptr = getelementptr inbounds nuw i8, ptr %c, i64 %indvars.iv
%a.val = load i8, ptr %a.ptr, align 1
%b.val = load i8, ptr %b.ptr, align 1
%c.val = load i8, ptr %c.ptr, align 1
%a.ext = sext i8 %a.val to i32
%b.ext = sext i8 %b.val to i32
%c.ext = sext i8 %c.val to i32
%add = add nsw i32 %res, %c.ext
%mul.ab = mul nsw i32 %a.ext, %b.ext
%add2 = add i32 %add, %mul.ab
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
attributes #0 = { vscale_range(1,16) }
!0 = distinct !{!0, !1, !2}
!1 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
!2 = !{!"llvm.loop.interleave.count", i32 1}