
These tests rely on SCEV recognizing an "or" whose operands have no common bits as an "add". Add the disjoint flag to the relevant or instructions in preparation for switching SCEV to use the flag instead of the ValueTracking query. The IR with the disjoint flag matches what InstCombine would produce.
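For example, assuming %i has its low two bits known zero (as the unrolled induction variables in @foldedidx below do), an "or disjoint" is equivalent to an add; %i and %j here are illustrative names, not values from the tests:

  %j = or disjoint i32 %i, 3   ; no common set bits, so this equals: add i32 %i, 3

The disjoint flag asserts that the operands share no set bits, so no carries can occur and the or can be modeled as an add.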
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -O3 -mtriple=thumb-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s -check-prefix=A9

; @simple is the most basic chain of address induction variables. Chaining
; saves at least one register and avoids complex addressing and setup
; code.
;
; no expensive address computation in the preheader
; no complex address modes
define i32 @simple(ptr %a, ptr %b, i32 %x) nounwind {
; A9-LABEL: simple:
; A9:       @ %bb.0: @ %entry
; A9-NEXT:    .save {r4, r5, r6, lr}
; A9-NEXT:    push {r4, r5, r6, lr}
; A9-NEXT:    mov r3, r0
; A9-NEXT:    lsls r2, r2, #2
; A9-NEXT:    movs r0, #0
; A9-NEXT:  .LBB0_1: @ %loop
; A9-NEXT:    @ =>This Inner Loop Header: Depth=1
; A9-NEXT:    add.w lr, r3, r2
; A9-NEXT:    ldr.w r12, [r3, r2]
; A9-NEXT:    ldr r3, [r3]
; A9-NEXT:    add.w r4, lr, r2
; A9-NEXT:    ldr.w r6, [lr, r2]
; A9-NEXT:    add r0, r3
; A9-NEXT:    adds r3, r4, r2
; A9-NEXT:    add r0, r12
; A9-NEXT:    ldr r5, [r4, r2]
; A9-NEXT:    add r0, r6
; A9-NEXT:    add r3, r2
; A9-NEXT:    add r0, r5
; A9-NEXT:    cmp r3, r1
; A9-NEXT:    bne .LBB0_1
; A9-NEXT:  @ %bb.2: @ %exit
; A9-NEXT:    pop {r4, r5, r6, pc}
entry:
  br label %loop
loop:
  %iv = phi ptr [ %a, %entry ], [ %iv4, %loop ]
  %s = phi i32 [ 0, %entry ], [ %s4, %loop ]
  %v = load i32, ptr %iv
  %iv1 = getelementptr inbounds i32, ptr %iv, i32 %x
  %v1 = load i32, ptr %iv1
  %iv2 = getelementptr inbounds i32, ptr %iv1, i32 %x
  %v2 = load i32, ptr %iv2
  %iv3 = getelementptr inbounds i32, ptr %iv2, i32 %x
  %v3 = load i32, ptr %iv3
  %s1 = add i32 %s, %v
  %s2 = add i32 %s1, %v1
  %s3 = add i32 %s2, %v2
  %s4 = add i32 %s3, %v3
  %iv4 = getelementptr inbounds i32, ptr %iv3, i32 %x
  %cmp = icmp eq ptr %iv4, %b
  br i1 %cmp, label %exit, label %loop
exit:
  ret i32 %s4
}

; @user is not currently chained because the IV is live across memory ops.
;
; stride multiples computed in the preheader
; complex address modes
define i32 @user(ptr %a, ptr %b, i32 %x) nounwind {
; A9-LABEL: user:
; A9:       @ %bb.0: @ %entry
; A9-NEXT:    .save {r4, r5, r6, r7, lr}
; A9-NEXT:    push {r4, r5, r6, r7, lr}
; A9-NEXT:    add.w r3, r2, r2, lsl #1
; A9-NEXT:    lsl.w r12, r2, #4
; A9-NEXT:    lsl.w lr, r3, #2
; A9-NEXT:    movs r3, #0
; A9-NEXT:  .LBB1_1: @ %loop
; A9-NEXT:    @ =>This Inner Loop Header: Depth=1
; A9-NEXT:    ldr r4, [r0]
; A9-NEXT:    ldr.w r5, [r0, r2, lsl #3]
; A9-NEXT:    ldr.w r6, [r0, r2, lsl #2]
; A9-NEXT:    add r3, r4
; A9-NEXT:    ldr.w r7, [r0, lr]
; A9-NEXT:    add r3, r6
; A9-NEXT:    add r3, r5
; A9-NEXT:    add r3, r7
; A9-NEXT:    str r3, [r0]
; A9-NEXT:    add r0, r12
; A9-NEXT:    cmp r0, r1
; A9-NEXT:    bne .LBB1_1
; A9-NEXT:  @ %bb.2: @ %exit
; A9-NEXT:    mov r0, r3
; A9-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  br label %loop
loop:
  %iv = phi ptr [ %a, %entry ], [ %iv4, %loop ]
  %s = phi i32 [ 0, %entry ], [ %s4, %loop ]
  %v = load i32, ptr %iv
  %iv1 = getelementptr inbounds i32, ptr %iv, i32 %x
  %v1 = load i32, ptr %iv1
  %iv2 = getelementptr inbounds i32, ptr %iv1, i32 %x
  %v2 = load i32, ptr %iv2
  %iv3 = getelementptr inbounds i32, ptr %iv2, i32 %x
  %v3 = load i32, ptr %iv3
  %s1 = add i32 %s, %v
  %s2 = add i32 %s1, %v1
  %s3 = add i32 %s2, %v2
  %s4 = add i32 %s3, %v3
  %iv4 = getelementptr inbounds i32, ptr %iv3, i32 %x
  store i32 %s4, ptr %iv
  %cmp = icmp eq ptr %iv4, %b
  br i1 %cmp, label %exit, label %loop
exit:
  ret i32 %s4
}

; @extrastride is a slightly more interesting case of a single
; complete chain with multiple strides. The test case IR is what LSR
; used to do, and exactly what we don't want to do. LSR's new IV
; chaining feature should now undo the damage.
;
; no spills
; only one stride multiple in the preheader
; no complex address modes or reloads
define void @extrastride(ptr nocapture %main, i32 %main_stride, ptr nocapture %res, i32 %x, i32 %y, i32 %z) nounwind {
; A9-LABEL: extrastride:
; A9:       @ %bb.0: @ %entry
; A9-NEXT:    .save {r4, r5, r6, r7, lr}
; A9-NEXT:    push {r4, r5, r6, r7, lr}
; A9-NEXT:    ldr.w r12, [sp, #24]
; A9-NEXT:    cmp.w r12, #0
; A9-NEXT:    beq .LBB2_3
; A9-NEXT:  @ %bb.1: @ %for.body.lr.ph
; A9-NEXT:    ldr r4, [sp, #20]
; A9-NEXT:    add.w lr, r3, r1
; A9-NEXT:    lsls r3, r4, #2
; A9-NEXT:  .LBB2_2: @ %for.body
; A9-NEXT:    @ =>This Inner Loop Header: Depth=1
; A9-NEXT:    adds r5, r0, r1
; A9-NEXT:    ldr r4, [r0, r1]
; A9-NEXT:    ldr r0, [r0]
; A9-NEXT:    subs.w r12, r12, #1
; A9-NEXT:    ldr r6, [r5, r1]
; A9-NEXT:    add r5, r1
; A9-NEXT:    add r0, r4
; A9-NEXT:    ldr r7, [r5, r1]
; A9-NEXT:    add r5, r1
; A9-NEXT:    add r0, r6
; A9-NEXT:    ldr r4, [r5, r1]
; A9-NEXT:    add r0, r7
; A9-NEXT:    add r0, r4
; A9-NEXT:    str r0, [r2]
; A9-NEXT:    add.w r0, r5, r1
; A9-NEXT:    add r2, r3
; A9-NEXT:    add r0, lr
; A9-NEXT:    bne .LBB2_2
; A9-NEXT:  .LBB2_3: @ %for.end
; A9-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %cmp8 = icmp eq i32 %z, 0
  br i1 %cmp8, label %for.end, label %for.body.lr.ph

for.body.lr.ph:                                   ; preds = %entry
  %add.ptr.sum = shl i32 %main_stride, 1 ; s*2
  %add.ptr1.sum = add i32 %add.ptr.sum, %main_stride ; s*3
  %add.ptr2.sum = add i32 %x, %main_stride ; s + x
  %add.ptr4.sum = shl i32 %main_stride, 2 ; s*4
  %add.ptr3.sum = add i32 %add.ptr2.sum, %add.ptr4.sum ; total IV stride = s*5+x
  br label %for.body

for.body:                                         ; preds = %for.body.lr.ph, %for.body
  %main.addr.011 = phi ptr [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
  %i.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
  %res.addr.09 = phi ptr [ %res, %for.body.lr.ph ], [ %add.ptr7, %for.body ]
  %0 = load i32, ptr %main.addr.011, align 4
  %add.ptr = getelementptr inbounds i8, ptr %main.addr.011, i32 %main_stride
  %1 = load i32, ptr %add.ptr, align 4
  %add.ptr1 = getelementptr inbounds i8, ptr %main.addr.011, i32 %add.ptr.sum
  %2 = load i32, ptr %add.ptr1, align 4
  %add.ptr2 = getelementptr inbounds i8, ptr %main.addr.011, i32 %add.ptr1.sum
  %3 = load i32, ptr %add.ptr2, align 4
  %add.ptr3 = getelementptr inbounds i8, ptr %main.addr.011, i32 %add.ptr4.sum
  %4 = load i32, ptr %add.ptr3, align 4
  %add = add i32 %1, %0
  %add4 = add i32 %add, %2
  %add5 = add i32 %add4, %3
  %add6 = add i32 %add5, %4
  store i32 %add6, ptr %res.addr.09, align 4
  %add.ptr6 = getelementptr inbounds i8, ptr %main.addr.011, i32 %add.ptr3.sum
  %add.ptr7 = getelementptr inbounds i32, ptr %res.addr.09, i32 %y
  %inc = add i32 %i.010, 1
  %cmp = icmp eq i32 %inc, %z
  br i1 %cmp, label %for.end, label %for.body

for.end:                                          ; preds = %for.body, %entry
  ret void
}

; @foldedidx is an unrolled variant of this loop:
; for (unsigned long i = 0; i < len; i += s) {
;   c[i] = a[i] + b[i];
; }
; where 's' can be folded into the addressing mode.
; Consequently, we should *not* form any chains.
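;
; Note: %i.07 starts at 0 and steps by 4, so its low two bits are always zero;
; the "or disjoint" forms of the unrolled indices below (%i.07 | 1, | 2, | 3)
; share no set bits with their constant operand and are therefore equivalent
; to adds, which is what lets SCEV treat them as address recurrences.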
define void @foldedidx(ptr nocapture %a, ptr nocapture %b, ptr nocapture %c) nounwind ssp {
; A9-LABEL: foldedidx:
; A9:       @ %bb.0: @ %entry
; A9-NEXT:    .save {r4, r5, r6, lr}
; A9-NEXT:    push {r4, r5, r6, lr}
; A9-NEXT:    mov.w lr, #0
; A9-NEXT:  .LBB3_1: @ %for.body
; A9-NEXT:    @ =>This Inner Loop Header: Depth=1
; A9-NEXT:    ldrb.w r12, [r0, lr]
; A9-NEXT:    add.w r4, r1, lr
; A9-NEXT:    ldrb.w r3, [r1, lr]
; A9-NEXT:    add r3, r12
; A9-NEXT:    strb.w r3, [r2, lr]
; A9-NEXT:    add.w r3, r0, lr
; A9-NEXT:    ldrb.w r12, [r3, #1]
; A9-NEXT:    ldrb r5, [r4, #1]
; A9-NEXT:    add r12, r5
; A9-NEXT:    add.w r5, r2, lr
; A9-NEXT:    strb.w r12, [r5, #1]
; A9-NEXT:    add.w lr, lr, #4
; A9-NEXT:    cmp.w lr, #400
; A9-NEXT:    ldrb.w r12, [r3, #2]
; A9-NEXT:    ldrb r6, [r4, #2]
; A9-NEXT:    add r6, r12
; A9-NEXT:    strb r6, [r5, #2]
; A9-NEXT:    ldrb r3, [r3, #3]
; A9-NEXT:    ldrb r6, [r4, #3]
; A9-NEXT:    add r3, r6
; A9-NEXT:    strb r3, [r5, #3]
; A9-NEXT:    bne .LBB3_1
; A9-NEXT:  @ %bb.2: @ %for.end
; A9-NEXT:    pop {r4, r5, r6, pc}
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %i.07 = phi i32 [ 0, %entry ], [ %inc.3, %for.body ]
  %arrayidx = getelementptr inbounds i8, ptr %a, i32 %i.07
  %0 = load i8, ptr %arrayidx, align 1
  %conv5 = zext i8 %0 to i32
  %arrayidx1 = getelementptr inbounds i8, ptr %b, i32 %i.07
  %1 = load i8, ptr %arrayidx1, align 1
  %conv26 = zext i8 %1 to i32
  %add = add nsw i32 %conv26, %conv5
  %conv3 = trunc i32 %add to i8
  %arrayidx4 = getelementptr inbounds i8, ptr %c, i32 %i.07
  store i8 %conv3, ptr %arrayidx4, align 1
  %inc1 = or disjoint i32 %i.07, 1
  %arrayidx.1 = getelementptr inbounds i8, ptr %a, i32 %inc1
  %2 = load i8, ptr %arrayidx.1, align 1
  %conv5.1 = zext i8 %2 to i32
  %arrayidx1.1 = getelementptr inbounds i8, ptr %b, i32 %inc1
  %3 = load i8, ptr %arrayidx1.1, align 1
  %conv26.1 = zext i8 %3 to i32
  %add.1 = add nsw i32 %conv26.1, %conv5.1
  %conv3.1 = trunc i32 %add.1 to i8
  %arrayidx4.1 = getelementptr inbounds i8, ptr %c, i32 %inc1
  store i8 %conv3.1, ptr %arrayidx4.1, align 1
  %inc.12 = or disjoint i32 %i.07, 2
  %arrayidx.2 = getelementptr inbounds i8, ptr %a, i32 %inc.12
  %4 = load i8, ptr %arrayidx.2, align 1
  %conv5.2 = zext i8 %4 to i32
  %arrayidx1.2 = getelementptr inbounds i8, ptr %b, i32 %inc.12
  %5 = load i8, ptr %arrayidx1.2, align 1
  %conv26.2 = zext i8 %5 to i32
  %add.2 = add nsw i32 %conv26.2, %conv5.2
  %conv3.2 = trunc i32 %add.2 to i8
  %arrayidx4.2 = getelementptr inbounds i8, ptr %c, i32 %inc.12
  store i8 %conv3.2, ptr %arrayidx4.2, align 1
  %inc.23 = or disjoint i32 %i.07, 3
  %arrayidx.3 = getelementptr inbounds i8, ptr %a, i32 %inc.23
  %6 = load i8, ptr %arrayidx.3, align 1
  %conv5.3 = zext i8 %6 to i32
  %arrayidx1.3 = getelementptr inbounds i8, ptr %b, i32 %inc.23
  %7 = load i8, ptr %arrayidx1.3, align 1
  %conv26.3 = zext i8 %7 to i32
  %add.3 = add nsw i32 %conv26.3, %conv5.3
  %conv3.3 = trunc i32 %add.3 to i8
  %arrayidx4.3 = getelementptr inbounds i8, ptr %c, i32 %inc.23
  store i8 %conv3.3, ptr %arrayidx4.3, align 1
  %inc.3 = add nsw i32 %i.07, 4
  %exitcond.3 = icmp eq i32 %inc.3, 400
  br i1 %exitcond.3, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

; @testNeon is an important example of the need for IV chains.
;
; Loads and stores should use post-increment addressing, with no add or add.w
; instructions.
; Most importantly, there should be no spills or reloads!
define hidden void @testNeon(ptr %ref_data, i32 %ref_stride, i32 %limit, ptr nocapture %data) nounwind optsize {
; A9-LABEL: testNeon:
; A9:       @ %bb.0:
; A9-NEXT:    .save {r4, r5, r7, lr}
; A9-NEXT:    push {r4, r5, r7, lr}
; A9-NEXT:    vmov.i32 q8, #0x0
; A9-NEXT:    cmp r2, #1
; A9-NEXT:    blt .LBB4_4
; A9-NEXT:  @ %bb.1: @ %.lr.ph
; A9-NEXT:    movs r5, #0
; A9-NEXT:    movw r4, #64464
; A9-NEXT:    sub.w r12, r5, r2, lsl #6
; A9-NEXT:    sub.w lr, r1, r1, lsl #4
; A9-NEXT:    movt r4, #65535
; A9-NEXT:    mov r5, r3
; A9-NEXT:  .LBB4_2: @ =>This Inner Loop Header: Depth=1
; A9-NEXT:    vld1.64 {d18}, [r0], r1
; A9-NEXT:    subs r2, #1
; A9-NEXT:    vld1.64 {d19}, [r0], r1
; A9-NEXT:    vst1.8 {d18, d19}, [r5]!
; A9-NEXT:    vld1.64 {d20}, [r0], r1
; A9-NEXT:    vld1.64 {d21}, [r0], r1
; A9-NEXT:    vst1.8 {d20, d21}, [r5]!
; A9-NEXT:    vld1.64 {d22}, [r0], r1
; A9-NEXT:    vadd.i8 q9, q9, q10
; A9-NEXT:    vld1.64 {d23}, [r0], r1
; A9-NEXT:    vst1.8 {d22, d23}, [r5]!
; A9-NEXT:    vld1.64 {d20}, [r0], r1
; A9-NEXT:    vadd.i8 q9, q9, q11
; A9-NEXT:    vld1.64 {d21}, [r0], lr
; A9-NEXT:    vadd.i8 q9, q9, q10
; A9-NEXT:    vadd.i8 q8, q8, q9
; A9-NEXT:    vst1.8 {d20, d21}, [r5], r4
; A9-NEXT:    bne .LBB4_2
; A9-NEXT:  @ %bb.3: @ %._crit_edge
; A9-NEXT:    add.w r3, r3, r12, lsl #4
; A9-NEXT:  .LBB4_4:
; A9-NEXT:    vst1.32 {d16, d17}, [r3]
; A9-NEXT:    pop {r4, r5, r7, pc}
  %1 = icmp sgt i32 %limit, 0
  br i1 %1, label %.lr.ph, label %45

.lr.ph:                                           ; preds = %0
  %2 = shl nsw i32 %ref_stride, 1
  %3 = mul nsw i32 %ref_stride, 3
  %4 = shl nsw i32 %ref_stride, 2
  %5 = mul nsw i32 %ref_stride, 5
  %6 = mul nsw i32 %ref_stride, 6
  %7 = mul nsw i32 %ref_stride, 7
  %8 = shl nsw i32 %ref_stride, 3
  %9 = sub i32 0, %8
  %10 = mul i32 %limit, -64
  br label %11

; <label>:11                                      ; preds = %11, %.lr.ph
  %.05 = phi ptr [ %ref_data, %.lr.ph ], [ %42, %11 ]
  %counter.04 = phi i32 [ 0, %.lr.ph ], [ %44, %11 ]
  %result.03 = phi <16 x i8> [ zeroinitializer, %.lr.ph ], [ %41, %11 ]
  %.012 = phi ptr [ %data, %.lr.ph ], [ %43, %11 ]
  %12 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %.05, i32 1) nounwind
  %13 = getelementptr inbounds i8, ptr %.05, i32 %ref_stride
  %14 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %13, i32 1) nounwind
  %15 = shufflevector <1 x i64> %12, <1 x i64> %14, <2 x i32> <i32 0, i32 1>
  %16 = bitcast <2 x i64> %15 to <16 x i8>
  %17 = getelementptr inbounds <16 x i8>, ptr %.012, i32 1
  store <16 x i8> %16, ptr %.012, align 4
  %18 = getelementptr inbounds i8, ptr %.05, i32 %2
  %19 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %18, i32 1) nounwind
  %20 = getelementptr inbounds i8, ptr %.05, i32 %3
  %21 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %20, i32 1) nounwind
  %22 = shufflevector <1 x i64> %19, <1 x i64> %21, <2 x i32> <i32 0, i32 1>
  %23 = bitcast <2 x i64> %22 to <16 x i8>
  %24 = getelementptr inbounds <16 x i8>, ptr %.012, i32 2
  store <16 x i8> %23, ptr %17, align 4
  %25 = getelementptr inbounds i8, ptr %.05, i32 %4
  %26 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %25, i32 1) nounwind
  %27 = getelementptr inbounds i8, ptr %.05, i32 %5
  %28 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %27, i32 1) nounwind
  %29 = shufflevector <1 x i64> %26, <1 x i64> %28, <2 x i32> <i32 0, i32 1>
  %30 = bitcast <2 x i64> %29 to <16 x i8>
  %31 = getelementptr inbounds <16 x i8>, ptr %.012, i32 3
  store <16 x i8> %30, ptr %24, align 4
  %32 = getelementptr inbounds i8, ptr %.05, i32 %6
  %33 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %32, i32 1) nounwind
  %34 = getelementptr inbounds i8, ptr %.05, i32 %7
  %35 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr %34, i32 1) nounwind
  %36 = shufflevector <1 x i64> %33, <1 x i64> %35, <2 x i32> <i32 0, i32 1>
  %37 = bitcast <2 x i64> %36 to <16 x i8>
  store <16 x i8> %37, ptr %31, align 4
  %38 = add <16 x i8> %16, %23
  %39 = add <16 x i8> %38, %30
  %40 = add <16 x i8> %39, %37
  %41 = add <16 x i8> %result.03, %40
  %42 = getelementptr i8, ptr %.05, i32 %9
  %43 = getelementptr inbounds <16 x i8>, ptr %.012, i32 -64
  %44 = add nsw i32 %counter.04, 1
  %exitcond = icmp eq i32 %44, %limit
  br i1 %exitcond, label %._crit_edge, label %11

._crit_edge:                                      ; preds = %11
  %scevgep = getelementptr <16 x i8>, ptr %data, i32 %10
  br label %45

; <label>:45                                      ; preds = %._crit_edge, %0
  %result.0.lcssa = phi <16 x i8> [ %41, %._crit_edge ], [ zeroinitializer, %0 ]
  %.01.lcssa = phi ptr [ %scevgep, %._crit_edge ], [ %data, %0 ]
  store <16 x i8> %result.0.lcssa, ptr %.01.lcssa, align 4
  ret void
}

declare <1 x i64> @llvm.arm.neon.vld1.v1i64.p0(ptr, i32) nounwind readonly

; Handle chains in which the same offset is used for both loads and
; stores to the same array.
; rdar://11410078.
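;
; Each negative-offset pointer (%add.ptr3, %add.ptr7, %add.ptr11) feeds a vld1
; and is later the destination of a vst1, so a single post-incremented pointer
; can serve both accesses, as the vld1/vst1 post-increment sequence in the
; checks shows.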
define void @testReuse(ptr %src, i32 %stride) nounwind ssp {
; A9-LABEL: testReuse:
; A9:       @ %bb.0: @ %entry
; A9-NEXT:    sub.w r12, r0, r1, lsl #2
; A9-NEXT:    sub.w r0, r1, r1, lsl #2
; A9-NEXT:    lsls r2, r0, #1
; A9-NEXT:    movs r3, #0
; A9-NEXT:  .LBB5_1: @ %for.body
; A9-NEXT:    @ =>This Inner Loop Header: Depth=1
; A9-NEXT:    add.w r0, r12, r3
; A9-NEXT:    adds r3, #8
; A9-NEXT:    vld1.8 {d16}, [r0], r1
; A9-NEXT:    cmp r3, #32
; A9-NEXT:    vld1.8 {d17}, [r0], r1
; A9-NEXT:    vhadd.u8 d16, d16, d17
; A9-NEXT:    vld1.8 {d18}, [r0], r1
; A9-NEXT:    vhadd.u8 d17, d17, d18
; A9-NEXT:    vld1.8 {d19}, [r0], r1
; A9-NEXT:    vhadd.u8 d18, d18, d19
; A9-NEXT:    vld1.8 {d20}, [r0], r1
; A9-NEXT:    vhadd.u8 d19, d19, d20
; A9-NEXT:    vld1.8 {d21}, [r0], r1
; A9-NEXT:    vhadd.u8 d20, d20, d21
; A9-NEXT:    vld1.8 {d22}, [r0], r1
; A9-NEXT:    vhadd.u8 d21, d21, d22
; A9-NEXT:    vld1.8 {d23}, [r0], r2
; A9-NEXT:    vst1.8 {d16}, [r0], r1
; A9-NEXT:    vst1.8 {d17}, [r0], r1
; A9-NEXT:    vst1.8 {d18}, [r0], r1
; A9-NEXT:    vst1.8 {d19}, [r0], r1
; A9-NEXT:    vst1.8 {d20}, [r0], r1
; A9-NEXT:    vst1.8 {d21}, [r0]
; A9-NEXT:    bne .LBB5_1
; A9-NEXT:  @ %bb.2: @ %for.end
; A9-NEXT:    bx lr
entry:
  %mul = shl nsw i32 %stride, 2
  %idx.neg = sub i32 0, %mul
  %mul1 = mul nsw i32 %stride, 3
  %idx.neg2 = sub i32 0, %mul1
  %mul5 = shl nsw i32 %stride, 1
  %idx.neg6 = sub i32 0, %mul5
  %idx.neg10 = sub i32 0, %stride
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %i.0110 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  %src.addr = phi ptr [ %src, %entry ], [ %add.ptr45, %for.body ]
  %add.ptr = getelementptr inbounds i8, ptr %src.addr, i32 %idx.neg
  %vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %add.ptr, i32 1)
  %add.ptr3 = getelementptr inbounds i8, ptr %src.addr, i32 %idx.neg2
  %vld2 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %add.ptr3, i32 1)
  %add.ptr7 = getelementptr inbounds i8, ptr %src.addr, i32 %idx.neg6
  %vld3 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %add.ptr7, i32 1)
  %add.ptr11 = getelementptr inbounds i8, ptr %src.addr, i32 %idx.neg10
  %vld4 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %add.ptr11, i32 1)
  %vld5 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %src.addr, i32 1)
  %add.ptr17 = getelementptr inbounds i8, ptr %src.addr, i32 %stride
  %vld6 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %add.ptr17, i32 1)
  %add.ptr20 = getelementptr inbounds i8, ptr %src.addr, i32 %mul5
  %vld7 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %add.ptr20, i32 1)
  %add.ptr23 = getelementptr inbounds i8, ptr %src.addr, i32 %mul1
  %vld8 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr %add.ptr23, i32 1)
  %vadd1 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld1, <8 x i8> %vld2) nounwind
  %vadd2 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld2, <8 x i8> %vld3) nounwind
  %vadd3 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld3, <8 x i8> %vld4) nounwind
  %vadd4 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld4, <8 x i8> %vld5) nounwind
  %vadd5 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld5, <8 x i8> %vld6) nounwind
  %vadd6 = tail call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %vld6, <8 x i8> %vld7) nounwind
  tail call void @llvm.arm.neon.vst1.p0.v8i8(ptr %add.ptr3, <8 x i8> %vadd1, i32 1)
  tail call void @llvm.arm.neon.vst1.p0.v8i8(ptr %add.ptr7, <8 x i8> %vadd2, i32 1)
  tail call void @llvm.arm.neon.vst1.p0.v8i8(ptr %add.ptr11, <8 x i8> %vadd3, i32 1)
  tail call void @llvm.arm.neon.vst1.p0.v8i8(ptr %src.addr, <8 x i8> %vadd4, i32 1)
  tail call void @llvm.arm.neon.vst1.p0.v8i8(ptr %add.ptr17, <8 x i8> %vadd5, i32 1)
  tail call void @llvm.arm.neon.vst1.p0.v8i8(ptr %add.ptr20, <8 x i8> %vadd6, i32 1)
  %inc = add nsw i32 %i.0110, 1
  %add.ptr45 = getelementptr inbounds i8, ptr %src.addr, i32 8
  %exitcond = icmp eq i32 %inc, 4
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

declare <8 x i8> @llvm.arm.neon.vld1.v8i8.p0(ptr, i32) nounwind readonly

declare void @llvm.arm.neon.vst1.p0.v8i8(ptr, <8 x i8>, i32) nounwind

declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone