Nikita Popov eecb99c5f6 [Tests] Add disjoint flag to some tests (NFC)
These tests rely on SCEV recognizing an "or" with no common
bits as an "add". Add the disjoint flag to relevant or instructions
in preparation for switching SCEV to use the flag instead of the
ValueTracking query. The IR with disjoint flag matches what
InstCombine would produce.
2023-12-05 14:09:36 +01:00

497 lines
19 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -O3 -mtriple=thumb-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s -check-prefix=A9
; @simple is the most basic chain of address induction variables. Chaining
; saves at least one register and avoids complex addressing and setup
; code.
;
; no expensive address computation in the preheader
; no complex address modes
define i32 @simple(ptr %a, ptr %b, i32 %x) nounwind {
; A9-LABEL: simple:
; A9: @ %bb.0: @ %entry
; A9-NEXT: .save {r4, r5, r6, lr}
; A9-NEXT: push {r4, r5, r6, lr}
; A9-NEXT: mov r3, r0
; A9-NEXT: lsls r2, r2, #2
; A9-NEXT: movs r0, #0
; A9-NEXT: .LBB0_1: @ %loop
; A9-NEXT: @ =>This Inner Loop Header: Depth=1
; A9-NEXT: add.w lr, r3, r2
; A9-NEXT: ldr.w r12, [r3, r2]
; A9-NEXT: ldr r3, [r3]
; A9-NEXT: add.w r4, lr, r2
; A9-NEXT: ldr.w r6, [lr, r2]
; A9-NEXT: add r0, r3
; A9-NEXT: adds r3, r4, r2
; A9-NEXT: add r0, r12
; A9-NEXT: ldr r5, [r4, r2]
; A9-NEXT: add r0, r6
; A9-NEXT: add r3, r2
; A9-NEXT: add r0, r5
; A9-NEXT: cmp r3, r1
; A9-NEXT: bne .LBB0_1
; A9-NEXT: @ %bb.2: @ %exit
; A9-NEXT: pop {r4, r5, r6, pc}
entry:
br label %loop
loop:
; Four loads per iteration, each GEP based on the previous one with the
; same i32 stride %x; this forms one IV chain that LSR can collapse so
; the generated code needs no per-load address setup in the preheader.
%iv = phi ptr [ %a, %entry ], [ %iv4, %loop ]
%s = phi i32 [ 0, %entry ], [ %s4, %loop ]
%v = load i32, ptr %iv
%iv1 = getelementptr inbounds i32, ptr %iv, i32 %x
%v1 = load i32, ptr %iv1
%iv2 = getelementptr inbounds i32, ptr %iv1, i32 %x
%v2 = load i32, ptr %iv2
%iv3 = getelementptr inbounds i32, ptr %iv2, i32 %x
%v3 = load i32, ptr %iv3
%s1 = add i32 %s, %v
%s2 = add i32 %s1, %v1
%s3 = add i32 %s2, %v2
%s4 = add i32 %s3, %v3
%iv4 = getelementptr inbounds i32, ptr %iv3, i32 %x
; Loop terminates when the chained pointer reaches the end pointer %b.
%cmp = icmp eq ptr %iv4, %b
br i1 %cmp, label %exit, label %loop
exit:
ret i32 %s4
}
; @user is not currently chained because the IV is live across memory ops.
;
; stride multiples computed in the preheader
; complex address modes
define i32 @user(ptr %a, ptr %b, i32 %x) nounwind {
; A9-LABEL: user:
; A9: @ %bb.0: @ %entry
; A9-NEXT: .save {r4, r5, r6, r7, lr}
; A9-NEXT: push {r4, r5, r6, r7, lr}
; A9-NEXT: add.w r3, r2, r2, lsl #1
; A9-NEXT: lsl.w r12, r2, #4
; A9-NEXT: lsl.w lr, r3, #2
; A9-NEXT: movs r3, #0
; A9-NEXT: .LBB1_1: @ %loop
; A9-NEXT: @ =>This Inner Loop Header: Depth=1
; A9-NEXT: ldr r4, [r0]
; A9-NEXT: ldr.w r5, [r0, r2, lsl #3]
; A9-NEXT: ldr.w r6, [r0, r2, lsl #2]
; A9-NEXT: add r3, r4
; A9-NEXT: ldr.w r7, [r0, lr]
; A9-NEXT: add r3, r6
; A9-NEXT: add r3, r5
; A9-NEXT: add r3, r7
; A9-NEXT: str r3, [r0]
; A9-NEXT: add r0, r12
; A9-NEXT: cmp r0, r1
; A9-NEXT: bne .LBB1_1
; A9-NEXT: @ %bb.2: @ %exit
; A9-NEXT: mov r0, r3
; A9-NEXT: pop {r4, r5, r6, r7, pc}
entry:
br label %loop
loop:
; Same GEP chain as @simple, but the base pointer %iv is reused by the
; store below, so it stays live across the memory ops and the IV is not
; chained; strides are instead materialized in the preheader.
%iv = phi ptr [ %a, %entry ], [ %iv4, %loop ]
%s = phi i32 [ 0, %entry ], [ %s4, %loop ]
%v = load i32, ptr %iv
%iv1 = getelementptr inbounds i32, ptr %iv, i32 %x
%v1 = load i32, ptr %iv1
%iv2 = getelementptr inbounds i32, ptr %iv1, i32 %x
%v2 = load i32, ptr %iv2
%iv3 = getelementptr inbounds i32, ptr %iv2, i32 %x
%v3 = load i32, ptr %iv3
%s1 = add i32 %s, %v
%s2 = add i32 %s1, %v1
%s3 = add i32 %s2, %v2
%s4 = add i32 %s3, %v3
%iv4 = getelementptr inbounds i32, ptr %iv3, i32 %x
; This store is the extra use of %iv that defeats chaining.
store i32 %s4, ptr %iv
%cmp = icmp eq ptr %iv4, %b
br i1 %cmp, label %exit, label %loop
exit:
ret i32 %s4
}
; @extrastride is a slightly more interesting case of a single
; complete chain with multiple strides. The test case IR is what LSR
; used to do, and exactly what we don't want to do. LSR's new IV
; chaining feature should now undo the damage.
;
; no spills
; only one stride multiple in the preheader
; no complex address modes or reloads
define void @extrastride(ptr nocapture %main, i32 %main_stride, ptr nocapture %res, i32 %x, i32 %y, i32 %z) nounwind {
; A9-LABEL: extrastride:
; A9: @ %bb.0: @ %entry
; A9-NEXT: .save {r4, r5, r6, r7, lr}
; A9-NEXT: push {r4, r5, r6, r7, lr}
; A9-NEXT: ldr.w r12, [sp, #24]
; A9-NEXT: cmp.w r12, #0
; A9-NEXT: beq .LBB2_3
; A9-NEXT: @ %bb.1: @ %for.body.lr.ph
; A9-NEXT: ldr r4, [sp, #20]
; A9-NEXT: add.w lr, r3, r1
; A9-NEXT: lsls r3, r4, #2
; A9-NEXT: .LBB2_2: @ %for.body
; A9-NEXT: @ =>This Inner Loop Header: Depth=1
; A9-NEXT: adds r5, r0, r1
; A9-NEXT: ldr r4, [r0, r1]
; A9-NEXT: ldr r0, [r0]
; A9-NEXT: subs.w r12, r12, #1
; A9-NEXT: ldr r6, [r5, r1]
; A9-NEXT: add r5, r1
; A9-NEXT: add r0, r4
; A9-NEXT: ldr r7, [r5, r1]
; A9-NEXT: add r5, r1
; A9-NEXT: add r0, r6
; A9-NEXT: ldr r4, [r5, r1]
; A9-NEXT: add r0, r7
; A9-NEXT: add r0, r4
; A9-NEXT: str r0, [r2]
; A9-NEXT: add.w r0, r5, r1
; A9-NEXT: add r2, r3
; A9-NEXT: add r0, lr
; A9-NEXT: bne .LBB2_2
; A9-NEXT: .LBB2_3: @ %for.end
; A9-NEXT: pop {r4, r5, r6, r7, pc}
entry:
%cmp8 = icmp eq i32 %z, 0
br i1 %cmp8, label %for.end, label %for.body.lr.ph
for.body.lr.ph: ; preds = %entry
; Pre-computed stride multiples (s*2, s*3, s*4) and the total per-iteration
; pointer step s*5+x, mimicking what LSR historically produced; the IV
; chaining feature is expected to undo this and fold the loads back into
; a single running pointer.
%add.ptr.sum = shl i32 %main_stride, 1 ; s*2
%add.ptr1.sum = add i32 %add.ptr.sum, %main_stride ; s*3
%add.ptr2.sum = add i32 %x, %main_stride ; s + x
%add.ptr4.sum = shl i32 %main_stride, 2 ; s*4
%add.ptr3.sum = add i32 %add.ptr2.sum, %add.ptr4.sum ; total IV stride = s*5+x
br label %for.body
for.body: ; preds = %for.body.lr.ph, %for.body
%main.addr.011 = phi ptr [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
%i.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
%res.addr.09 = phi ptr [ %res, %for.body.lr.ph ], [ %add.ptr7, %for.body ]
%0 = load i32, ptr %main.addr.011, align 4
%add.ptr = getelementptr inbounds i8, ptr %main.addr.011, i32 %main_stride
%1 = load i32, ptr %add.ptr, align 4
%add.ptr1 = getelementptr inbounds i8, ptr %main.addr.011, i32 %add.ptr.sum
%2 = load i32, ptr %add.ptr1, align 4
%add.ptr2 = getelementptr inbounds i8, ptr %main.addr.011, i32 %add.ptr1.sum
%3 = load i32, ptr %add.ptr2, align 4
%add.ptr3 = getelementptr inbounds i8, ptr %main.addr.011, i32 %add.ptr4.sum
%4 = load i32, ptr %add.ptr3, align 4
%add = add i32 %1, %0
%add4 = add i32 %add, %2
%add5 = add i32 %add4, %3
store i32 %add6, ptr %res.addr.09, align 4
%add.ptr6 = getelementptr inbounds i8, ptr %main.addr.011, i32 %add.ptr3.sum
%add.ptr7 = getelementptr inbounds i32, ptr %res.addr.09, i32 %y
%inc = add i32 %i.010, 1
%cmp = icmp eq i32 %inc, %z
br i1 %cmp, label %for.end, label %for.body
for.end: ; preds = %for.body, %entry
ret void
}
; @foldedidx is an unrolled variant of this loop:
; for (unsigned long i = 0; i < len; i += s) {
; c[i] = a[i] + b[i];
; }
; where 's' can be folded into the addressing mode.
; Consequently, we should *not* form any chains.
define void @foldedidx(ptr nocapture %a, ptr nocapture %b, ptr nocapture %c) nounwind ssp {
; A9-LABEL: foldedidx:
; A9: @ %bb.0: @ %entry
; A9-NEXT: .save {r4, r5, r6, lr}
; A9-NEXT: push {r4, r5, r6, lr}
; A9-NEXT: mov.w lr, #0
; A9-NEXT: .LBB3_1: @ %for.body
; A9-NEXT: @ =>This Inner Loop Header: Depth=1
; A9-NEXT: ldrb.w r12, [r0, lr]
; A9-NEXT: add.w r4, r1, lr
; A9-NEXT: ldrb.w r3, [r1, lr]
; A9-NEXT: add r3, r12
; A9-NEXT: strb.w r3, [r2, lr]
; A9-NEXT: add.w r3, r0, lr
; A9-NEXT: ldrb.w r12, [r3, #1]
; A9-NEXT: ldrb r5, [r4, #1]
; A9-NEXT: add r12, r5
; A9-NEXT: add.w r5, r2, lr
; A9-NEXT: strb.w r12, [r5, #1]
; A9-NEXT: add.w lr, lr, #4
; A9-NEXT: cmp.w lr, #400
; A9-NEXT: ldrb.w r12, [r3, #2]
; A9-NEXT: ldrb r6, [r4, #2]
; A9-NEXT: add r6, r12
; A9-NEXT: strb r6, [r5, #2]
; A9-NEXT: ldrb r3, [r3, #3]
; A9-NEXT: ldrb r6, [r4, #3]
; A9-NEXT: add r3, r6
; A9-NEXT: strb r3, [r5, #3]
; A9-NEXT: bne .LBB3_1
; A9-NEXT: @ %bb.2: @ %for.end
; A9-NEXT: pop {r4, r5, r6, pc}
entry:
br label %for.body
for.body: ; preds = %for.body, %entry
; Unrolled-by-4 body. %i.07 advances by 4 from 0, so its low two bits are
; always zero and the `or disjoint` increments below are equivalent to
; adds; SCEV relies on the disjoint flag to see them as adds (this matches
; what InstCombine would produce).
%i.07 = phi i32 [ 0, %entry ], [ %inc.3, %for.body ]
%arrayidx = getelementptr inbounds i8, ptr %a, i32 %i.07
%0 = load i8, ptr %arrayidx, align 1
%conv5 = zext i8 %0 to i32
%arrayidx1 = getelementptr inbounds i8, ptr %b, i32 %i.07
%1 = load i8, ptr %arrayidx1, align 1
%conv26 = zext i8 %1 to i32
%add = add nsw i32 %conv26, %conv5
%conv3 = trunc i32 %add to i8
%arrayidx4 = getelementptr inbounds i8, ptr %c, i32 %i.07
store i8 %conv3, ptr %arrayidx4, align 1
%inc1 = or disjoint i32 %i.07, 1
%arrayidx.1 = getelementptr inbounds i8, ptr %a, i32 %inc1
%2 = load i8, ptr %arrayidx.1, align 1
%conv5.1 = zext i8 %2 to i32
%arrayidx1.1 = getelementptr inbounds i8, ptr %b, i32 %inc1
%3 = load i8, ptr %arrayidx1.1, align 1
%conv26.1 = zext i8 %3 to i32
%add.1 = add nsw i32 %conv26.1, %conv5.1
%conv3.1 = trunc i32 %add.1 to i8
%arrayidx4.1 = getelementptr inbounds i8, ptr %c, i32 %inc1
store i8 %conv3.1, ptr %arrayidx4.1, align 1
%inc.12 = or disjoint i32 %i.07, 2
%arrayidx.2 = getelementptr inbounds i8, ptr %a, i32 %inc.12
%4 = load i8, ptr %arrayidx.2, align 1
%conv5.2 = zext i8 %4 to i32
%arrayidx1.2 = getelementptr inbounds i8, ptr %b, i32 %inc.12
%5 = load i8, ptr %arrayidx1.2, align 1
%conv26.2 = zext i8 %5 to i32
%add.2 = add nsw i32 %conv26.2, %conv5.2
%conv3.2 = trunc i32 %add.2 to i8
%arrayidx4.2 = getelementptr inbounds i8, ptr %c, i32 %inc.12
store i8 %conv3.2, ptr %arrayidx4.2, align 1
%inc.23 = or disjoint i32 %i.07, 3
%arrayidx.3 = getelementptr inbounds i8, ptr %a, i32 %inc.23
%6 = load i8, ptr %arrayidx.3, align 1
%conv5.3 = zext i8 %6 to i32
%arrayidx1.3 = getelementptr inbounds i8, ptr %b, i32 %inc.23
%7 = load i8, ptr %arrayidx1.3, align 1
%conv26.3 = zext i8 %7 to i32
%add.3 = add nsw i32 %conv26.3, %conv5.3
%conv3.3 = trunc i32 %add.3 to i8
%arrayidx4.3 = getelementptr inbounds i8, ptr %c, i32 %inc.23
store i8 %conv3.3, ptr %arrayidx4.3, align 1
%inc.3 = add nsw i32 %i.07, 4
%exitcond.3 = icmp eq i32 %inc.3, 400
br i1 %exitcond.3, label %for.end, label %for.body
for.end: ; preds = %for.body
ret void
}
; @testNeon is an important example of the need for IV chains.
;
; Loads and stores should use post-increment addressing, no add's or add.w's.
; Most importantly, there should be no spills or reloads!
define hidden void @testNeon(ptr %ref_data, i32 %ref_stride, i32 %limit, ptr nocapture %data) nounwind optsize {
; A9-LABEL: testNeon:
; A9: @ %bb.0:
; A9-NEXT: .save {r4, r5, r7, lr}
; A9-NEXT: push {r4, r5, r7, lr}
; A9-NEXT: vmov.i32 q8, #0x0
; A9-NEXT: cmp r2, #1
; A9-NEXT: blt .LBB4_4
; A9-NEXT: @ %bb.1: @ %.lr.ph
; A9-NEXT: movs r5, #0
; A9-NEXT: movw r4, #64464
; A9-NEXT: sub.w r12, r5, r2, lsl #6
; A9-NEXT: sub.w lr, r1, r1, lsl #4
; A9-NEXT: movt r4, #65535
; A9-NEXT: mov r5, r3
; A9-NEXT: .LBB4_2: @ =>This Inner Loop Header: Depth=1
; A9-NEXT: vld1.64 {d18}, [r0], r1
; A9-NEXT: subs r2, #1
; A9-NEXT: vld1.64 {d19}, [r0], r1
; A9-NEXT: vst1.8 {d18, d19}, [r5]!
; A9-NEXT: vld1.64 {d20}, [r0], r1
; A9-NEXT: vld1.64 {d21}, [r0], r1
; A9-NEXT: vst1.8 {d20, d21}, [r5]!
; A9-NEXT: vld1.64 {d22}, [r0], r1
; A9-NEXT: vadd.i8 q9, q9, q10
; A9-NEXT: vld1.64 {d23}, [r0], r1
; A9-NEXT: vst1.8 {d22, d23}, [r5]!
; A9-NEXT: vld1.64 {d20}, [r0], r1
; A9-NEXT: vadd.i8 q9, q9, q11
; A9-NEXT: vld1.64 {d21}, [r0], lr
; A9-NEXT: vadd.i8 q9, q9, q10
; A9-NEXT: vadd.i8 q8, q8, q9
; A9-NEXT: vst1.8 {d20, d21}, [r5], r4
; A9-NEXT: bne .LBB4_2
; A9-NEXT: @ %bb.3: @ %._crit_edge
; A9-NEXT: add.w r3, r3, r12, lsl #4
; A9-NEXT: .LBB4_4:
; A9-NEXT: vst1.32 {d16, d17}, [r3]
; A9-NEXT: pop {r4, r5, r7, pc}
%1 = icmp sgt i32 %limit, 0
br i1 %1, label %.lr.ph, label %45
.lr.ph: ; preds = %0
; Stride multiples s*1..s*7 for the eight vld1 loads per iteration, plus
; the per-iteration pointer steps (-s*8 for the source, -64 <16 x i8>
; elements for the destination); chaining should turn these into NEON
; post-increment addressing with no spills.
%2 = shl nsw i32 %ref_stride, 1
%3 = mul nsw i32 %ref_stride, 3
%4 = shl nsw i32 %ref_stride, 2
%5 = mul nsw i32 %ref_stride, 5
%6 = mul nsw i32 %ref_stride, 6
%7 = mul nsw i32 %ref_stride, 7
%8 = shl nsw i32 %ref_stride, 3
%9 = sub i32 0, %8
%10 = mul i32 %limit, -64
br label %11
; <label>:11 ; preds = %11, %.lr.ph
%.05 = phi ptr [ %ref_data, %.lr.ph ], [ %42, %11 ]
%counter.04 = phi i32 [ 0, %.lr.ph ], [ %44, %11 ]
%result.03 = phi <16 x i8> [ zeroinitializer, %.lr.ph ], [ %41, %11 ]
%.012 = phi ptr [ %data, %.lr.ph ], [ %43, %11 ]
%12 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %.05, i32 1) nounwind
%13 = getelementptr inbounds i8, ptr %.05, i32 %ref_stride
%14 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %13, i32 1) nounwind
%15 = shufflevector <1 x i64> %12, <1 x i64> %14, <2 x i32> <i32 0, i32 1>
%16 = bitcast <2 x i64> %15 to <16 x i8>
%17 = getelementptr inbounds <16 x i8>, ptr %.012, i32 1
store <16 x i8> %16, ptr %.012, align 4
%18 = getelementptr inbounds i8, ptr %.05, i32 %2
%19 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %18, i32 1) nounwind
%20 = getelementptr inbounds i8, ptr %.05, i32 %3
%21 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %20, i32 1) nounwind
%22 = shufflevector <1 x i64> %19, <1 x i64> %21, <2 x i32> <i32 0, i32 1>
%23 = bitcast <2 x i64> %22 to <16 x i8>
%24 = getelementptr inbounds <16 x i8>, ptr %.012, i32 2
store <16 x i8> %23, ptr %17, align 4
%25 = getelementptr inbounds i8, ptr %.05, i32 %4
%26 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %25, i32 1) nounwind
%27 = getelementptr inbounds i8, ptr %.05, i32 %5
%28 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %27, i32 1) nounwind
%29 = shufflevector <1 x i64> %26, <1 x i64> %28, <2 x i32> <i32 0, i32 1>
%30 = bitcast <2 x i64> %29 to <16 x i8>
%31 = getelementptr inbounds <16 x i8>, ptr %.012, i32 3
store <16 x i8> %30, ptr %24, align 4
%32 = getelementptr inbounds i8, ptr %.05, i32 %6
%33 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %32, i32 1) nounwind
%34 = getelementptr inbounds i8, ptr %.05, i32 %7
%35 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %34, i32 1) nounwind
%36 = shufflevector <1 x i64> %33, <1 x i64> %35, <2 x i32> <i32 0, i32 1>
%37 = bitcast <2 x i64> %36 to <16 x i8>
store <16 x i8> %37, ptr %31, align 4
; Accumulate all four 16-byte vectors into the running result.
%38 = add <16 x i8> %16, %23
%39 = add <16 x i8> %38, %30
%40 = add <16 x i8> %39, %37
%41 = add <16 x i8> %result.03, %40
%42 = getelementptr i8, ptr %.05, i32 %9
%43 = getelementptr inbounds <16 x i8>, ptr %.012, i32 -64
%44 = add nsw i32 %counter.04, 1
%exitcond = icmp eq i32 %44, %limit
br i1 %exitcond, label %._crit_edge, label %11
._crit_edge: ; preds = %11
%scevgep = getelementptr <16 x i8>, ptr %data, i32 %10
br label %45
; <label>:45 ; preds = %._crit_edge, %0
%result.0.lcssa = phi <16 x i8> [ %41, %._crit_edge ], [ zeroinitializer, %0 ]
%.01.lcssa = phi ptr [ %scevgep, %._crit_edge ], [ %data, %0 ]
store <16 x i8> %result.0.lcssa, ptr %.01.lcssa, align 4
ret void
}
declare <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr, i32) nounwind readonly
; Handle chains in which the same offset is used for both loads and
; stores to the same array.
; rdar://11410078.
define void @testReuse(ptr %src, i32 %stride) nounwind ssp {
; A9-LABEL: testReuse:
; A9: @ %bb.0: @ %entry
; A9-NEXT: sub.w r12, r0, r1, lsl #2
; A9-NEXT: sub.w r0, r1, r1, lsl #2
; A9-NEXT: lsls r2, r0, #1
; A9-NEXT: movs r3, #0
; A9-NEXT: .LBB5_1: @ %for.body
; A9-NEXT: @ =>This Inner Loop Header: Depth=1
; A9-NEXT: add.w r0, r12, r3
; A9-NEXT: adds r3, #8
; A9-NEXT: vld1.8 {d16}, [r0], r1
; A9-NEXT: cmp r3, #32
; A9-NEXT: vld1.8 {d17}, [r0], r1
; A9-NEXT: vhadd.u8 d16, d16, d17
; A9-NEXT: vld1.8 {d18}, [r0], r1
; A9-NEXT: vhadd.u8 d17, d17, d18
; A9-NEXT: vld1.8 {d19}, [r0], r1
; A9-NEXT: vhadd.u8 d18, d18, d19
; A9-NEXT: vld1.8 {d20}, [r0], r1
; A9-NEXT: vhadd.u8 d19, d19, d20
; A9-NEXT: vld1.8 {d21}, [r0], r1
; A9-NEXT: vhadd.u8 d20, d20, d21
; A9-NEXT: vld1.8 {d22}, [r0], r1
; A9-NEXT: vhadd.u8 d21, d21, d22
; A9-NEXT: vld1.8 {d23}, [r0], r2
; A9-NEXT: vst1.8 {d16}, [r0], r1
; A9-NEXT: vst1.8 {d17}, [r0], r1
; A9-NEXT: vst1.8 {d18}, [r0], r1
; A9-NEXT: vst1.8 {d19}, [r0], r1
; A9-NEXT: vst1.8 {d20}, [r0], r1
; A9-NEXT: vst1.8 {d21}, [r0]
; A9-NEXT: bne .LBB5_1
; A9-NEXT: @ %bb.2: @ %for.end
; A9-NEXT: bx lr
entry:
; Negative stride multiples (-s*4 .. -s*1) so that loads cover offsets
; -4s..+3s around %src.addr; the stores below reuse the same offsets
; (-3s..+2s), exercising chains where one offset serves both a load and
; a store into the same array.
%mul = shl nsw i32 %stride, 2
%idx.neg = sub i32 0, %mul
%mul1 = mul nsw i32 %stride, 3
%idx.neg2 = sub i32 0, %mul1
%mul5 = shl nsw i32 %stride, 1
%idx.neg6 = sub i32 0, %mul5
%idx.neg10 = sub i32 0, %stride
br label %for.body
for.body: ; preds = %for.body, %entry
%i.0110 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%src.addr = phi ptr [ %src, %entry ], [ %add.ptr45, %for.body ]
%add.ptr = getelementptr inbounds i8, ptr %src.addr, i32 %idx.neg
%vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %add.ptr, i32 1)
%add.ptr3 = getelementptr inbounds i8, ptr %src.addr, i32 %idx.neg2
%vld2 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %add.ptr3, i32 1)
%add.ptr7 = getelementptr inbounds i8, ptr %src.addr, i32 %idx.neg6
%vld3 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %add.ptr7, i32 1)
%add.ptr11 = getelementptr inbounds i8, ptr %src.addr, i32 %idx.neg10
%vld4 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %add.ptr11, i32 1)
%vld5 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %src.addr, i32 1)
%add.ptr17 = getelementptr inbounds i8, ptr %src.addr, i32 %stride
%vld6 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %add.ptr17, i32 1)
%add.ptr20 = getelementptr inbounds i8, ptr %src.addr, i32 %mul5
%vld7 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %add.ptr20, i32 1)
%add.ptr23 = getelementptr inbounds i8, ptr %src.addr, i32 %mul1
%vld8 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %add.ptr23, i32 1)
; Each vhadd combines adjacent rows; results are stored one row above
; the second input, reusing the load addresses computed above.
%vadd1 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld1, <8 x i8> %vld2) nounwind
%vadd2 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld2, <8 x i8> %vld3) nounwind
%vadd3 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld3, <8 x i8> %vld4) nounwind
%vadd4 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld4, <8 x i8> %vld5) nounwind
%vadd5 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld5, <8 x i8> %vld6) nounwind
%vadd6 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld6, <8 x i8> %vld7) nounwind
tail call void @llvm.arm.neon.vst1.p0.v8i8(ptr %add.ptr3, <8 x i8> %vadd1, i32 1)
tail call void @llvm.arm.neon.vst1.p0.v8i8(ptr %add.ptr7, <8 x i8> %vadd2, i32 1)
tail call void @llvm.arm.neon.vst1.p0.v8i8(ptr %add.ptr11, <8 x i8> %vadd3, i32 1)
tail call void @llvm.arm.neon.vst1.p0.v8i8(ptr %src.addr, <8 x i8> %vadd4, i32 1)
tail call void @llvm.arm.neon.vst1.p0.v8i8(ptr %add.ptr17, <8 x i8> %vadd5, i32 1)
tail call void @llvm.arm.neon.vst1.p0.v8i8(ptr %add.ptr20, <8 x i8> %vadd6, i32 1)
%inc = add nsw i32 %i.0110, 1
%add.ptr45 = getelementptr inbounds i8, ptr %src.addr, i32 8
%exitcond = icmp eq i32 %inc, 4
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body
ret void
}
declare <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr, i32) nounwind readonly
declare void @llvm.arm.neon.vst1.p0.v8i8(ptr, <8 x i8>, i32) nounwind
declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone