; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs -ppc-formprep-chain-commoning \
; RUN: -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 < %s | FileCheck %s
; addresses:
; 1: base1 + offset
; 2: + offset
; 3: + offset
; 4: + offset
;
; chains:
; 1: base: base1 + offset, offsets: (0, offset)
; 2: base: base1 + 3*offset, offsets: (0, offset)
;
; long long two_chain_same_offset_succ(char *p, long long offset, long long base1, long long n) {
; long long o1 = base1 + offset;
; long long o2 = base1 + 2 * offset;
; long long o3 = base1 + 3 * offset;
; long long o4 = base1 + 4 * offset;
; char *p1 = p + o1;
; char *p2 = p + o2;
; char *p3 = p + o3;
; char *p4 = p + o4;
; long long sum = 0;
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)(p1 + i);
; unsigned long x2 = *(unsigned long *)(p2 + i);
; unsigned long x3 = *(unsigned long *)(p3 + i);
; unsigned long x4 = *(unsigned long *)(p4 + i);
; sum += x1 * x2 * x3 * x4;
; }
; return sum;
; }
;
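; As a hand-written illustration (not compiler output; the "_commoned" name and
; the q1/q2 locals are made up), the two chains above correspond to a source
; form with one running base per chain and a single reused in-chain offset:
;
; long long two_chain_same_offset_succ_commoned(char *p, long long offset, long long base1, long long n) {
;   char *q1 = p + base1 + offset;       // chain 1 base
;   char *q2 = p + base1 + 3 * offset;   // chain 2 base
;   long long sum = 0;
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(q1 + i);            // chain 1, +0
;     unsigned long x2 = *(unsigned long *)(q1 + i + offset);   // chain 1, +offset
;     unsigned long x3 = *(unsigned long *)(q2 + i);            // chain 2, +0
;     unsigned long x4 = *(unsigned long *)(q2 + i + offset);   // chain 2, +offset
;     sum += x1 * x2 * x3 * x4;
;   }
;   return sum;
; }
;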
define i64 @two_chain_same_offset_succ(ptr %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: two_chain_same_offset_succ:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 0
; CHECK-NEXT: ble cr0, .LBB0_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: sldi r7, r4, 1
; CHECK-NEXT: mtctr r6
; CHECK-NEXT: add r8, r4, r7
; CHECK-NEXT: add r7, r5, r4
; CHECK-NEXT: add r5, r5, r8
; CHECK-NEXT: add r7, r3, r7
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_2: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: ld r6, 0(r7)
; CHECK-NEXT: ldx r8, r7, r4
; CHECK-NEXT: ld r9, 0(r5)
; CHECK-NEXT: ldx r10, r5, r4
; CHECK-NEXT: addi r7, r7, 1
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: mulld r6, r8, r6
; CHECK-NEXT: mulld r6, r6, r9
; CHECK-NEXT: maddld r3, r6, r10, r3
; CHECK-NEXT: bdnz .LBB0_2
; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
; CHECK-NEXT: blr
; CHECK-NEXT: .LBB0_4:
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: blr
entry:
%mul = shl nsw i64 %offset, 1
%mul2 = mul nsw i64 %offset, 3
%mul4 = shl nsw i64 %offset, 2
%cmp46 = icmp sgt i64 %n, 0
br i1 %cmp46, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
%sum.0.lcssa = phi i64 [ 0, %entry ], [ %add16, %for.body ]
ret i64 %sum.0.lcssa
for.body: ; preds = %entry, %for.body
%sum.048 = phi i64 [ %add16, %for.body ], [ 0, %entry ]
%i.047 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%add = add i64 %i.047, %base1
%add.ptr9.idx = add i64 %add, %offset
%add.ptr9 = getelementptr inbounds i8, ptr %p, i64 %add.ptr9.idx
%0 = load i64, ptr %add.ptr9, align 8
%add.ptr10.idx = add i64 %add, %mul
%add.ptr10 = getelementptr inbounds i8, ptr %p, i64 %add.ptr10.idx
%1 = load i64, ptr %add.ptr10, align 8
%add.ptr11.idx = add i64 %add, %mul2
%add.ptr11 = getelementptr inbounds i8, ptr %p, i64 %add.ptr11.idx
%2 = load i64, ptr %add.ptr11, align 8
%add.ptr12.idx = add i64 %add, %mul4
%add.ptr12 = getelementptr inbounds i8, ptr %p, i64 %add.ptr12.idx
%3 = load i64, ptr %add.ptr12, align 8
%mul13 = mul i64 %1, %0
%mul14 = mul i64 %mul13, %2
%mul15 = mul i64 %mul14, %3
%add16 = add i64 %mul15, %sum.048
%inc = add nuw nsw i64 %i.047, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
; addresses:
; 1: base1 + offset
; 2: + offset
; 3: + offset
; 4: + offset
; 5: + offset
;
; It cannot be commoned into chains because one of the addresses would be left needing a chain of its own.
; It is not profitable to common chains when not all addresses are covered by chains (worked through after
; the C source below).
;
; long long not_perfect_chain_all_same_offset_fail(char *p, long long offset, long long base1, long long n) {
; long long o1 = base1 + offset;
; long long o2 = base1 + 2 * offset;
; long long o3 = base1 + 3 * offset;
; long long o4 = base1 + 4 * offset;
; long long o5 = base1 + 5 * offset;
; char *p1 = p + o1;
; char *p2 = p + o2;
; char *p3 = p + o3;
; char *p4 = p + o4;
; char *p5 = p + o5;
; long long sum = 0;
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)(p1 + i);
; unsigned long x2 = *(unsigned long *)(p2 + i);
; unsigned long x3 = *(unsigned long *)(p3 + i);
; unsigned long x4 = *(unsigned long *)(p4 + i);
; unsigned long x5 = *(unsigned long *)(p5 + i);
; sum += x1 * x2 * x3 * x4 * x5;
; }
; return sum;
; }
;
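; Worked decomposition (illustrative only, not compiler output):
;   o1 = base1 + offset, o2 = o1 + offset, o3 = o2 + offset, o4 = o3 + offset, o5 = o4 + offset
; Two-element chains can cover o1..o4 as (o1, o2) and (o3, o4), both with inner offset "offset",
; but o5 would still need a chain of its own, so commoning is rejected as unprofitable.
;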
define i64 @not_perfect_chain_all_same_offset_fail(ptr %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: not_perfect_chain_all_same_offset_fail:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 0
; CHECK-NEXT: ble cr0, .LBB1_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
; CHECK-NEXT: sldi r7, r4, 1
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: add r8, r4, r7
; CHECK-NEXT: sldi r9, r4, 2
; CHECK-NEXT: mtctr r6
; CHECK-NEXT: add r10, r4, r9
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB1_2: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: ldx r6, r5, r4
; CHECK-NEXT: ldx r11, r5, r7
; CHECK-NEXT: ldx r12, r5, r8
; CHECK-NEXT: ldx r0, r5, r9
; CHECK-NEXT: mulld r6, r11, r6
; CHECK-NEXT: ldx r30, r5, r10
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: mulld r6, r6, r12
; CHECK-NEXT: mulld r6, r6, r0
; CHECK-NEXT: maddld r3, r6, r30, r3
; CHECK-NEXT: bdnz .LBB1_2
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-NEXT: blr
; CHECK-NEXT: .LBB1_4:
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: blr
entry:
%mul = shl nsw i64 %offset, 1
%mul2 = mul nsw i64 %offset, 3
%mul4 = shl nsw i64 %offset, 2
%mul6 = mul nsw i64 %offset, 5
%cmp58 = icmp sgt i64 %n, 0
br i1 %cmp58, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
%sum.0.lcssa = phi i64 [ 0, %entry ], [ %add21, %for.body ]
ret i64 %sum.0.lcssa
for.body: ; preds = %entry, %for.body
%sum.060 = phi i64 [ %add21, %for.body ], [ 0, %entry ]
%i.059 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%add = add i64 %i.059, %base1
%add.ptr12.idx = add i64 %add, %offset
%add.ptr12 = getelementptr inbounds i8, ptr %p, i64 %add.ptr12.idx
%0 = load i64, ptr %add.ptr12, align 8
%add.ptr13.idx = add i64 %add, %mul
%add.ptr13 = getelementptr inbounds i8, ptr %p, i64 %add.ptr13.idx
%1 = load i64, ptr %add.ptr13, align 8
%add.ptr14.idx = add i64 %add, %mul2
%add.ptr14 = getelementptr inbounds i8, ptr %p, i64 %add.ptr14.idx
%2 = load i64, ptr %add.ptr14, align 8
%add.ptr15.idx = add i64 %add, %mul4
%add.ptr15 = getelementptr inbounds i8, ptr %p, i64 %add.ptr15.idx
%3 = load i64, ptr %add.ptr15, align 8
%add.ptr16.idx = add i64 %add, %mul6
%add.ptr16 = getelementptr inbounds i8, ptr %p, i64 %add.ptr16.idx
%4 = load i64, ptr %add.ptr16, align 8
%mul17 = mul i64 %1, %0
%mul18 = mul i64 %mul17, %2
%mul19 = mul i64 %mul18, %3
%mul20 = mul i64 %mul19, %4
%add21 = add i64 %mul20, %sum.060
%inc = add nuw nsw i64 %i.059, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
; addresses:
; 1: base1
; 2: + 2*offset
; 3: + offset
;
; We need at least 4 addresses to form 2 chains that reuse at least 1 offset (worked through after the
; C source below).
;
; long long no_enough_elements_fail(char *p, long long offset, long long base1, long long n) {
; long long o1 = base1;
; long long o2 = base1 + 2 * offset;
; long long o3 = base1 + 3 * offset;
; char *p1 = p + o1;
; char *p2 = p + o2;
; char *p3 = p + o3;
; long long sum = 0;
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)(p1 + i);
; unsigned long x2 = *(unsigned long *)(p2 + i);
; unsigned long x3 = *(unsigned long *)(p3 + i);
; sum += x1 * x2 * x3;
; }
; return sum;
; }
;
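; Worked decomposition (illustrative only; the grouping shown is just one possibility):
;   o1 = base1, o2 = base1 + 2*offset, o3 = base1 + 3*offset
; At best this gives one two-element chain, e.g. (o2, o3) with inner offset "offset", plus the lone
; address o1. Reusing an offset requires a second chain with the same inner offset, i.e. at least
; 4 addresses, so commoning is skipped.
;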
define i64 @no_enough_elements_fail(ptr %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: no_enough_elements_fail:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 0
; CHECK-NEXT: ble cr0, .LBB2_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: sldi r7, r4, 1
; CHECK-NEXT: mtctr r6
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: add r4, r4, r7
; CHECK-NEXT: .p2align 5
; CHECK-NEXT: .LBB2_2: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: ld r6, 0(r5)
; CHECK-NEXT: ldx r8, r5, r7
; CHECK-NEXT: ldx r9, r5, r4
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: mulld r6, r8, r6
; CHECK-NEXT: maddld r3, r6, r9, r3
; CHECK-NEXT: bdnz .LBB2_2
; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
; CHECK-NEXT: blr
; CHECK-NEXT: .LBB2_4:
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: blr
entry:
%mul = shl nsw i64 %offset, 1
%mul1 = mul nsw i64 %offset, 3
%cmp32 = icmp sgt i64 %n, 0
br i1 %cmp32, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
%sum.0.lcssa = phi i64 [ 0, %entry ], [ %add10, %for.body ]
ret i64 %sum.0.lcssa
for.body: ; preds = %entry, %for.body
%sum.034 = phi i64 [ %add10, %for.body ], [ 0, %entry ]
%i.033 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%add.ptr5.idx = add i64 %i.033, %base1
%add.ptr5 = getelementptr inbounds i8, ptr %p, i64 %add.ptr5.idx
%0 = load i64, ptr %add.ptr5, align 8
%add.ptr6.idx = add i64 %add.ptr5.idx, %mul
%add.ptr6 = getelementptr inbounds i8, ptr %p, i64 %add.ptr6.idx
%1 = load i64, ptr %add.ptr6, align 8
%add.ptr7.idx = add i64 %add.ptr5.idx, %mul1
%add.ptr7 = getelementptr inbounds i8, ptr %p, i64 %add.ptr7.idx
%2 = load i64, ptr %add.ptr7, align 8
%mul8 = mul i64 %1, %0
%mul9 = mul i64 %mul8, %2
%add10 = add i64 %mul9, %sum.034
%inc = add nuw nsw i64 %i.033, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
; addresses:
; 1: base1
; 2: + 2*offset
; 3: + 2*offset
; 4: + 3*offset
;
; The diff between address 2 and address 1 is 2*offset, but this offset is not reused by the other candidate
; chain (the diff between address 4 and address 3 is 3*offset), so we cannot common any chains (worked through
; after the C source below).
;
; long long no_reuseable_offset_fail(char *p, long long offset, long long base1, long long n) {
; long long o1 = base1;
; long long o2 = base1 + 2 * offset;
; long long o3 = base1 + 4 * offset;
; long long o4 = base1 + 7 * offset;
; char *p1 = p + o1;
; char *p2 = p + o2;
; char *p3 = p + o3;
; char *p4 = p + o4;
; long long sum = 0;
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)(p1 + i);
; unsigned long x2 = *(unsigned long *)(p2 + i);
; unsigned long x3 = *(unsigned long *)(p3 + i);
; unsigned long x4 = *(unsigned long *)(p4 + i);
; sum += x1 * x2 * x3 * x4;
; }
; return sum;
; }
;
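; Worked decomposition (illustrative only, not compiler output):
;   chain A = (o1, o2) with inner offset 2*offset
;   chain B = (o3, o4) with inner offset 3*offset
; Since 2*offset != 3*offset, no offset is shared between the two chains and nothing is commoned.
;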
define i64 @no_reuseable_offset_fail(ptr %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: no_reuseable_offset_fail:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 0
; CHECK-NEXT: ble cr0, .LBB3_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: sldi r9, r4, 3
; CHECK-NEXT: mtctr r6
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: sldi r7, r4, 1
; CHECK-NEXT: sldi r8, r4, 2
; CHECK-NEXT: sub r4, r9, r4
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB3_2: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: ld r6, 0(r5)
; CHECK-NEXT: ldx r9, r5, r7
; CHECK-NEXT: ldx r10, r5, r8
; CHECK-NEXT: ldx r11, r5, r4
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: mulld r6, r9, r6
; CHECK-NEXT: mulld r6, r6, r10
; CHECK-NEXT: maddld r3, r6, r11, r3
; CHECK-NEXT: bdnz .LBB3_2
; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
; CHECK-NEXT: blr
; CHECK-NEXT: .LBB3_4:
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: blr
entry:
%mul = shl nsw i64 %offset, 1
%mul1 = shl nsw i64 %offset, 2
%mul3 = mul nsw i64 %offset, 7
%cmp44 = icmp sgt i64 %n, 0
br i1 %cmp44, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
%sum.0.lcssa = phi i64 [ 0, %entry ], [ %add15, %for.body ]
ret i64 %sum.0.lcssa
for.body: ; preds = %entry, %for.body
%sum.046 = phi i64 [ %add15, %for.body ], [ 0, %entry ]
%i.045 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%add.ptr8.idx = add i64 %i.045, %base1
%add.ptr8 = getelementptr inbounds i8, ptr %p, i64 %add.ptr8.idx
%0 = load i64, ptr %add.ptr8, align 8
%add.ptr9.idx = add i64 %add.ptr8.idx, %mul
%add.ptr9 = getelementptr inbounds i8, ptr %p, i64 %add.ptr9.idx
%1 = load i64, ptr %add.ptr9, align 8
%add.ptr10.idx = add i64 %add.ptr8.idx, %mul1
%add.ptr10 = getelementptr inbounds i8, ptr %p, i64 %add.ptr10.idx
%2 = load i64, ptr %add.ptr10, align 8
%add.ptr11.idx = add i64 %add.ptr8.idx, %mul3
%add.ptr11 = getelementptr inbounds i8, ptr %p, i64 %add.ptr11.idx
%3 = load i64, ptr %add.ptr11, align 8
%mul12 = mul i64 %1, %0
%mul13 = mul i64 %mul12, %2
%mul14 = mul i64 %mul13, %3
%add15 = add i64 %mul14, %sum.046
%inc = add nuw nsw i64 %i.045, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
; addresses:
; 1: base1 + offset
; 2: + offset
; 3: + 3*offset
; 4: + 2*offset
; 5: + 1*offset
; 6: + 2*offset
;
; The diff between address 2 and address 1 is 1*offset, and this offset is reused between address 4 and address 5,
; but the diff between address 3 and address 2 (3*offset) is not the same as the diff between address 6 and
; address 5 (2*offset), so we cannot common chains for these addresses (worked through after the C source below).
;
; long long not_same_offset_fail(char *p, long long offset, long long base1, long long n) {
; long long o1 = base1 + offset;
; long long o2 = base1 + 2 * offset;
; long long o3 = base1 + 5 * offset;
; long long o4 = base1 + 7 * offset;
; long long o5 = base1 + 8 * offset;
; long long o6 = base1 + 10 * offset;
; char *p1 = p + o1;
; char *p2 = p + o2;
; char *p3 = p + o3;
; char *p4 = p + o4;
; char *p5 = p + o5;
; char *p6 = p + o6;
; long long sum = 0;
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)(p1 + i);
; unsigned long x2 = *(unsigned long *)(p2 + i);
; unsigned long x3 = *(unsigned long *)(p3 + i);
; unsigned long x4 = *(unsigned long *)(p4 + i);
; unsigned long x5 = *(unsigned long *)(p5 + i);
; unsigned long x6 = *(unsigned long *)(p6 + i);
; sum += x1 * x2 * x3 * x4 * x5 * x6;
; }
; return sum;
; }
;
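; Worked decomposition (illustrative only, not compiler output):
;   chain A = (o1, o2, o3) with successive in-chain offsets (offset, 3*offset)
;   chain B = (o4, o5, o6) with successive in-chain offsets (offset, 2*offset)
; The first offsets match, but 3*offset != 2*offset, so the chains do not share a complete offset set
; and commoning is skipped.
;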
define i64 @not_same_offset_fail(ptr %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: not_same_offset_fail:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 0
; CHECK-NEXT: ble cr0, .LBB4_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
; CHECK-NEXT: mtctr r6
; CHECK-NEXT: mulli r11, r4, 10
; CHECK-NEXT: sldi r8, r4, 2
; CHECK-NEXT: add r8, r4, r8
; CHECK-NEXT: sldi r9, r4, 3
; CHECK-NEXT: sub r10, r9, r4
; CHECK-NEXT: sldi r7, r4, 1
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB4_2: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: ldx r6, r5, r4
; CHECK-NEXT: ldx r12, r5, r7
; CHECK-NEXT: ldx r0, r5, r8
; CHECK-NEXT: ldx r30, r5, r10
; CHECK-NEXT: mulld r6, r12, r6
; CHECK-NEXT: ldx r29, r5, r9
; CHECK-NEXT: ldx r28, r5, r11
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: mulld r6, r6, r0
; CHECK-NEXT: mulld r6, r6, r30
; CHECK-NEXT: mulld r6, r6, r29
; CHECK-NEXT: maddld r3, r6, r28, r3
; CHECK-NEXT: bdnz .LBB4_2
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r28, -32(r1) # 8-byte Folded Reload
; CHECK-NEXT: blr
; CHECK-NEXT: .LBB4_4:
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: blr
entry:
%mul = shl nsw i64 %offset, 1
%mul2 = mul nsw i64 %offset, 5
%mul4 = mul nsw i64 %offset, 7
%mul6 = shl nsw i64 %offset, 3
%mul8 = mul nsw i64 %offset, 10
%cmp70 = icmp sgt i64 %n, 0
br i1 %cmp70, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
%sum.0.lcssa = phi i64 [ 0, %entry ], [ %add26, %for.body ]
ret i64 %sum.0.lcssa
for.body: ; preds = %entry, %for.body
%sum.072 = phi i64 [ %add26, %for.body ], [ 0, %entry ]
%i.071 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%add = add i64 %i.071, %base1
%add.ptr15.idx = add i64 %add, %offset
%add.ptr15 = getelementptr inbounds i8, ptr %p, i64 %add.ptr15.idx
%0 = load i64, ptr %add.ptr15, align 8
%add.ptr16.idx = add i64 %add, %mul
%add.ptr16 = getelementptr inbounds i8, ptr %p, i64 %add.ptr16.idx
%1 = load i64, ptr %add.ptr16, align 8
%add.ptr17.idx = add i64 %add, %mul2
%add.ptr17 = getelementptr inbounds i8, ptr %p, i64 %add.ptr17.idx
%2 = load i64, ptr %add.ptr17, align 8
%add.ptr18.idx = add i64 %add, %mul4
%add.ptr18 = getelementptr inbounds i8, ptr %p, i64 %add.ptr18.idx
%3 = load i64, ptr %add.ptr18, align 8
%add.ptr19.idx = add i64 %add, %mul6
%add.ptr19 = getelementptr inbounds i8, ptr %p, i64 %add.ptr19.idx
%4 = load i64, ptr %add.ptr19, align 8
%add.ptr20.idx = add i64 %add, %mul8
%add.ptr20 = getelementptr inbounds i8, ptr %p, i64 %add.ptr20.idx
%5 = load i64, ptr %add.ptr20, align 8
%mul21 = mul i64 %1, %0
%mul22 = mul i64 %mul21, %2
%mul23 = mul i64 %mul22, %3
%mul24 = mul i64 %mul23, %4
%mul25 = mul i64 %mul24, %5
%add26 = add i64 %mul25, %sum.072
%inc = add nuw nsw i64 %i.071, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
; addresses:
; 1: base1 + offset
; 2: + 2*offset
; 3: + offset
; 4: + 2*offset
;
; chains:
; 1: base1 + offset, offsets: (0, 2*offset)
; 2: base1 + 4*offset, offsets: (0, 2*offset)
;
; long long two_chain_different_offsets_succ(char *p, long long offset, long long base1, long long n) {
; long long o1 = base1 + offset;
; long long o2 = base1 + 3 * offset;
; long long o3 = base1 + 4 * offset;
; long long o4 = base1 + 6 * offset;
; char *p1 = p + o1;
; char *p2 = p + o2;
; char *p3 = p + o3;
; char *p4 = p + o4;
; long long sum = 0;
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)(p1 + i);
; unsigned long x2 = *(unsigned long *)(p2 + i);
; unsigned long x3 = *(unsigned long *)(p3 + i);
; unsigned long x4 = *(unsigned long *)(p4 + i);
; sum += x1 * x2 * x3 * x4;
; }
; return sum;
; }
;
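; As a hand-written illustration (not compiler output; the "_commoned" name and
; the q1/q2 locals are made up), the two chains above correspond to:
;
; long long two_chain_different_offsets_succ_commoned(char *p, long long offset, long long base1, long long n) {
;   char *q1 = p + base1 + offset;       // chain 1 base
;   char *q2 = p + base1 + 4 * offset;   // chain 2 base
;   long long sum = 0;
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(q1 + i);                // chain 1, +0
;     unsigned long x2 = *(unsigned long *)(q1 + i + 2 * offset);   // chain 1, +2*offset
;     unsigned long x3 = *(unsigned long *)(q2 + i);                // chain 2, +0
;     unsigned long x4 = *(unsigned long *)(q2 + i + 2 * offset);   // chain 2, +2*offset
;     sum += x1 * x2 * x3 * x4;
;   }
;   return sum;
; }
;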
define i64 @two_chain_different_offsets_succ(ptr %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: two_chain_different_offsets_succ:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 0
; CHECK-NEXT: ble cr0, .LBB5_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: sldi r8, r4, 2
; CHECK-NEXT: add r7, r5, r4
; CHECK-NEXT: mtctr r6
; CHECK-NEXT: add r5, r5, r8
; CHECK-NEXT: add r7, r3, r7
; CHECK-NEXT: sldi r4, r4, 1
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB5_2: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: ld r6, 0(r7)
; CHECK-NEXT: ldx r8, r7, r4
; CHECK-NEXT: ld r9, 0(r5)
; CHECK-NEXT: ldx r10, r5, r4
; CHECK-NEXT: addi r7, r7, 1
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: mulld r6, r8, r6
; CHECK-NEXT: mulld r6, r6, r9
; CHECK-NEXT: maddld r3, r6, r10, r3
; CHECK-NEXT: bdnz .LBB5_2
; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
; CHECK-NEXT: blr
; CHECK-NEXT: .LBB5_4:
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: blr
entry:
%mul = mul nsw i64 %offset, 3
%mul2 = shl nsw i64 %offset, 2
%mul4 = mul nsw i64 %offset, 6
%cmp46 = icmp sgt i64 %n, 0
br i1 %cmp46, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
%sum.0.lcssa = phi i64 [ 0, %entry ], [ %add16, %for.body ]
ret i64 %sum.0.lcssa
for.body: ; preds = %entry, %for.body
%sum.048 = phi i64 [ %add16, %for.body ], [ 0, %entry ]
%i.047 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%add = add i64 %i.047, %base1
%add.ptr9.idx = add i64 %add, %offset
%add.ptr9 = getelementptr inbounds i8, ptr %p, i64 %add.ptr9.idx
%0 = load i64, ptr %add.ptr9, align 8
%add.ptr10.idx = add i64 %add, %mul
%add.ptr10 = getelementptr inbounds i8, ptr %p, i64 %add.ptr10.idx
%1 = load i64, ptr %add.ptr10, align 8
%add.ptr11.idx = add i64 %add, %mul2
%add.ptr11 = getelementptr inbounds i8, ptr %p, i64 %add.ptr11.idx
%2 = load i64, ptr %add.ptr11, align 8
%add.ptr12.idx = add i64 %add, %mul4
%add.ptr12 = getelementptr inbounds i8, ptr %p, i64 %add.ptr12.idx
%3 = load i64, ptr %add.ptr12, align 8
%mul13 = mul i64 %1, %0
%mul14 = mul i64 %mul13, %2
%mul15 = mul i64 %mul14, %3
%add16 = add i64 %mul15, %sum.048
%inc = add nuw nsw i64 %i.047, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
; addresses:
; 1: base1 + offset
; 2: + 2*offset
; 3: + base2 - base1 - 2*offset
; 4: + 2*offset
;
; chains:
; 1: base1 + offset, offsets: (0, 2*offset)
; 2: base2 + offset, offsets: (0, 2*offset)
;
; long long two_chain_two_bases_succ(char *p, long long offset, long long base1, long long base2, long long n) {
; long long o1 = base1 + offset;
; long long o2 = base1 + 3 * offset;
; long long o3 = base2 + offset;
; long long o4 = base2 + 3 * offset;
; char *p1 = p + o1;
; char *p2 = p + o2;
; char *p3 = p + o3;
; char *p4 = p + o4;
; long long sum = 0;
; for (long long i = 0; i < n; ++i) {
; unsigned long x1 = *(unsigned long *)(p1 + i);
; unsigned long x2 = *(unsigned long *)(p2 + i);
; unsigned long x3 = *(unsigned long *)(p3 + i);
; unsigned long x4 = *(unsigned long *)(p4 + i);
; sum += x1 * x2 * x3 * x4;
; }
; return sum;
; }
;
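; As a hand-written illustration (not compiler output; the "_commoned" name and
; the q1/q2 locals are made up), the two chains above use one base per input
; base value and share the same reused in-chain offset:
;
; long long two_chain_two_bases_succ_commoned(char *p, long long offset, long long base1, long long base2, long long n) {
;   char *q1 = p + base1 + offset;   // chain 1 base
;   char *q2 = p + base2 + offset;   // chain 2 base
;   long long sum = 0;
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(q1 + i);                // chain 1, +0
;     unsigned long x2 = *(unsigned long *)(q1 + i + 2 * offset);   // chain 1, +2*offset
;     unsigned long x3 = *(unsigned long *)(q2 + i);                // chain 2, +0
;     unsigned long x4 = *(unsigned long *)(q2 + i + 2 * offset);   // chain 2, +2*offset
;     sum += x1 * x2 * x3 * x4;
;   }
;   return sum;
; }
;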
define i64 @two_chain_two_bases_succ(ptr %p, i64 %offset, i64 %base1, i64 %base2, i64 %n) {
; CHECK-LABEL: two_chain_two_bases_succ:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r7, 0
; CHECK-NEXT: ble cr0, .LBB6_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: add r5, r5, r4
; CHECK-NEXT: add r6, r6, r4
; CHECK-NEXT: mtctr r7
; CHECK-NEXT: sldi r4, r4, 1
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: add r6, r3, r6
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB6_2: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: ld r7, 0(r5)
; CHECK-NEXT: ldx r8, r5, r4
; CHECK-NEXT: ld r9, 0(r6)
; CHECK-NEXT: ldx r10, r6, r4
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: addi r6, r6, 1
; CHECK-NEXT: mulld r7, r8, r7
; CHECK-NEXT: mulld r7, r7, r9
; CHECK-NEXT: maddld r3, r7, r10, r3
; CHECK-NEXT: bdnz .LBB6_2
; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
; CHECK-NEXT: blr
; CHECK-NEXT: .LBB6_4:
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: blr
entry:
%mul = mul nsw i64 %offset, 3
%cmp44 = icmp sgt i64 %n, 0
br i1 %cmp44, label %for.body, label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.body, %entry
%sum.0.lcssa = phi i64 [ 0, %entry ], [ %add15, %for.body ]
ret i64 %sum.0.lcssa
for.body: ; preds = %entry, %for.body
%sum.046 = phi i64 [ %add15, %for.body ], [ 0, %entry ]
%i.045 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%add = add i64 %i.045, %base1
%add.ptr8.idx = add i64 %add, %offset
%add.ptr8 = getelementptr inbounds i8, ptr %p, i64 %add.ptr8.idx
%0 = load i64, ptr %add.ptr8, align 8
%add1 = add i64 %i.045, %mul
%add.ptr9.idx = add i64 %add1, %base1
%add.ptr9 = getelementptr inbounds i8, ptr %p, i64 %add.ptr9.idx
%1 = load i64, ptr %add.ptr9, align 8
%add2 = add i64 %i.045, %base2
%add.ptr10.idx = add i64 %add2, %offset
%add.ptr10 = getelementptr inbounds i8, ptr %p, i64 %add.ptr10.idx
%2 = load i64, ptr %add.ptr10, align 8
%add.ptr11.idx = add i64 %add2, %mul
%add.ptr11 = getelementptr inbounds i8, ptr %p, i64 %add.ptr11.idx
%3 = load i64, ptr %add.ptr11, align 8
%mul12 = mul i64 %1, %0
%mul13 = mul i64 %mul12, %2
%mul14 = mul i64 %mul13, %3
%add15 = add i64 %mul14, %sum.046
%inc = add nuw nsw i64 %i.045, 1
%exitcond.not = icmp eq i64 %inc, %n
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
;
; Check that chain commoning can reduce register pressure and save register spills/reloads; a hand-written
; sketch of the commoned form follows the C source below.
;
; int spill_reduce_succ(double *input1, double *input2, double *output, long long m, long long inc1, long long inc2, long long inc3, long long inc4, long long inc) {
; inc = inc4;
; #pragma unroll 4
; for (long long i = 0; i < 4 * m; i++) {
; output[inc + inc1] += input1[inc + inc1] * input2[inc + inc1];
; output[inc + inc2] += input1[inc + inc2] * input2[inc + inc2];
; output[inc + inc3] += input1[inc + inc3] * input2[inc + inc3];
; inc = inc + inc4;
; }
; return 0;
; }
;
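; A hand-written, semantically equivalent pointer form of the loop above (illustrative only; the "_sketch"
; name and the a/b/o/d2/d3 locals are made up, and this is not the exact code the backend emits). It
; visualizes why chains help here: one running base per array plus two offsets reused by all three arrays,
; instead of nine independent addresses per iteration:
;
; int spill_reduce_succ_sketch(double *input1, double *input2, double *output, long long m,
;                              long long inc1, long long inc2, long long inc3, long long inc4) {
;   double *a = input1 + inc4 + inc1;
;   double *b = input2 + inc4 + inc1;
;   double *o = output + inc4 + inc1;
;   long long d2 = inc2 - inc1, d3 = inc3 - inc1; // offsets reused across input1/input2/output
;   for (long long i = 0; i < 4 * m; i++) {
;     o[0]  += a[0]  * b[0];
;     o[d2] += a[d2] * b[d2];
;     o[d3] += a[d3] * b[d3];
;     a += inc4; b += inc4; o += inc4;
;   }
;   return 0;
; }
;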
define signext i32 @spill_reduce_succ(ptr %input1, ptr %input2, ptr %output, i64 %m, i64 %inc1, i64 %inc2, i64 %inc3, i64 %inc4, i64 %inc) {
; CHECK-LABEL: spill_reduce_succ:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 0
; CHECK-NEXT: std r14, -144(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r15, -136(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r16, -128(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r17, -120(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r18, -112(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r19, -104(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r20, -96(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r21, -88(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r22, -80(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r23, -72(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r24, -64(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r25, -56(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r26, -48(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r31, -8(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r2, -152(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r9, -184(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r8, -176(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r7, -168(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r3, -160(r1) # 8-byte Folded Spill
; CHECK-NEXT: ble cr0, .LBB7_7
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: sldi r6, r6, 2
; CHECK-NEXT: li r7, 1
; CHECK-NEXT: mr r30, r10
; CHECK-NEXT: cmpdi r6, 1
; CHECK-NEXT: iselgt r7, r6, r7
; CHECK-NEXT: addi r8, r7, -1
; CHECK-NEXT: clrldi r6, r7, 63
; CHECK-NEXT: cmpldi r8, 3
; CHECK-NEXT: blt cr0, .LBB7_4
; CHECK-NEXT: # %bb.2: # %for.body.preheader.new
; CHECK-NEXT: ld r14, -168(r1) # 8-byte Folded Reload
; CHECK-NEXT: mulli r24, r30, 24
; CHECK-NEXT: ld r16, -184(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r15, -176(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r3, -160(r1) # 8-byte Folded Reload
; CHECK-NEXT: rldicl r0, r7, 62, 2
; CHECK-NEXT: sldi r11, r30, 5
; CHECK-NEXT: sldi r19, r30, 4
; CHECK-NEXT: sldi r7, r14, 3
; CHECK-NEXT: add r14, r30, r14
; CHECK-NEXT: sldi r10, r16, 3
; CHECK-NEXT: sldi r12, r15, 3
; CHECK-NEXT: add r16, r30, r16
; CHECK-NEXT: add r15, r30, r15
; CHECK-NEXT: add r27, r11, r7
; CHECK-NEXT: add r22, r24, r7
; CHECK-NEXT: add r17, r19, r7
; CHECK-NEXT: sldi r2, r14, 3
; CHECK-NEXT: add r26, r24, r10
; CHECK-NEXT: add r25, r24, r12
; CHECK-NEXT: add r21, r19, r10
; CHECK-NEXT: add r20, r19, r12
; CHECK-NEXT: add r8, r11, r10
; CHECK-NEXT: sldi r16, r16, 3
; CHECK-NEXT: add r29, r5, r27
; CHECK-NEXT: add r28, r4, r27
; CHECK-NEXT: add r27, r3, r27
; CHECK-NEXT: add r24, r5, r22
; CHECK-NEXT: add r23, r4, r22
; CHECK-NEXT: add r22, r3, r22
; CHECK-NEXT: add r19, r5, r17
; CHECK-NEXT: add r18, r4, r17
; CHECK-NEXT: add r17, r3, r17
; CHECK-NEXT: add r14, r5, r2
; CHECK-NEXT: add r31, r4, r2
; CHECK-NEXT: add r2, r3, r2
; CHECK-NEXT: add r9, r5, r8
; CHECK-NEXT: add r8, r11, r12
; CHECK-NEXT: add r26, r5, r26
; CHECK-NEXT: add r25, r5, r25
; CHECK-NEXT: add r21, r5, r21
; CHECK-NEXT: add r20, r5, r20
; CHECK-NEXT: add r16, r5, r16
; CHECK-NEXT: add r8, r5, r8
; CHECK-NEXT: rldicl r3, r0, 2, 1
; CHECK-NEXT: addi r3, r3, -4
; CHECK-NEXT: sub r0, r12, r7
; CHECK-NEXT: sub r12, r10, r7
; CHECK-NEXT: li r7, 0
; CHECK-NEXT: mr r10, r30
; CHECK-NEXT: sldi r15, r15, 3
; CHECK-NEXT: add r15, r5, r15
; CHECK-NEXT: rldicl r3, r3, 62, 2
; CHECK-NEXT: addi r3, r3, 1
; CHECK-NEXT: mtctr r3
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB7_3: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: lfd f0, 0(r2)
; CHECK-NEXT: lfd f1, 0(r31)
; CHECK-NEXT: add r3, r10, r30
; CHECK-NEXT: add r3, r3, r30
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfd f1, 0(r14)
; CHECK-NEXT: add r3, r3, r30
; CHECK-NEXT: add r10, r3, r30
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfd f0, 0(r14)
; CHECK-NEXT: add r14, r14, r11
; CHECK-NEXT: lfdx f0, r2, r0
; CHECK-NEXT: lfdx f1, r31, r0
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r15, r7
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r15, r7
; CHECK-NEXT: lfdx f0, r2, r12
; CHECK-NEXT: lfdx f1, r31, r12
; CHECK-NEXT: add r2, r2, r11
; CHECK-NEXT: add r31, r31, r11
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r16, r7
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r16, r7
; CHECK-NEXT: lfd f0, 0(r17)
; CHECK-NEXT: lfd f1, 0(r18)
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r19, r7
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r19, r7
; CHECK-NEXT: lfdx f0, r17, r0
; CHECK-NEXT: lfdx f1, r18, r0
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r20, r7
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r20, r7
; CHECK-NEXT: lfdx f0, r17, r12
; CHECK-NEXT: lfdx f1, r18, r12
; CHECK-NEXT: add r17, r17, r11
; CHECK-NEXT: add r18, r18, r11
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r21, r7
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r21, r7
; CHECK-NEXT: lfd f0, 0(r22)
; CHECK-NEXT: lfd f1, 0(r23)
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r24, r7
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r24, r7
; CHECK-NEXT: lfdx f0, r22, r0
; CHECK-NEXT: lfdx f1, r23, r0
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r25, r7
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r25, r7
; CHECK-NEXT: lfdx f0, r22, r12
; CHECK-NEXT: lfdx f1, r23, r12
; CHECK-NEXT: add r22, r22, r11
; CHECK-NEXT: add r23, r23, r11
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r26, r7
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r26, r7
; CHECK-NEXT: lfd f0, 0(r27)
; CHECK-NEXT: lfd f1, 0(r28)
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r29, r7
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r29, r7
; CHECK-NEXT: lfdx f0, r27, r0
; CHECK-NEXT: lfdx f1, r28, r0
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r8, r7
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r8, r7
; CHECK-NEXT: lfdx f0, r27, r12
; CHECK-NEXT: lfdx f1, r28, r12
; CHECK-NEXT: add r27, r27, r11
; CHECK-NEXT: add r28, r28, r11
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r9, r7
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r9, r7
; CHECK-NEXT: add r7, r7, r11
; CHECK-NEXT: bdnz .LBB7_3
; CHECK-NEXT: .LBB7_4: # %for.cond.cleanup.loopexit.unr-lcssa
; CHECK-NEXT: cmpldi r6, 0
; CHECK-NEXT: beq cr0, .LBB7_7
; CHECK-NEXT: # %bb.5: # %for.body.epil.preheader
; CHECK-NEXT: ld r3, -184(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r0, -160(r1) # 8-byte Folded Reload
; CHECK-NEXT: sldi r8, r30, 3
; CHECK-NEXT: add r3, r10, r3
; CHECK-NEXT: sldi r3, r3, 3
; CHECK-NEXT: add r7, r5, r3
; CHECK-NEXT: add r9, r4, r3
; CHECK-NEXT: add r11, r0, r3
; CHECK-NEXT: ld r3, -176(r1) # 8-byte Folded Reload
; CHECK-NEXT: add r3, r10, r3
; CHECK-NEXT: sldi r3, r3, 3
; CHECK-NEXT: add r12, r5, r3
; CHECK-NEXT: add r30, r4, r3
; CHECK-NEXT: add r29, r0, r3
; CHECK-NEXT: ld r3, -168(r1) # 8-byte Folded Reload
; CHECK-NEXT: add r3, r10, r3
; CHECK-NEXT: li r10, 0
; CHECK-NEXT: sldi r3, r3, 3
; CHECK-NEXT: add r5, r5, r3
; CHECK-NEXT: add r4, r4, r3
; CHECK-NEXT: add r3, r0, r3
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB7_6: # %for.body.epil
; CHECK-NEXT: #
; CHECK-NEXT: lfdx f0, r3, r10
; CHECK-NEXT: lfdx f1, r4, r10
; CHECK-NEXT: addi r6, r6, -1
; CHECK-NEXT: cmpldi r6, 0
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfd f1, 0(r5)
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfd f0, 0(r5)
; CHECK-NEXT: add r5, r5, r8
; CHECK-NEXT: lfdx f0, r29, r10
; CHECK-NEXT: lfdx f1, r30, r10
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r12, r10
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r12, r10
; CHECK-NEXT: lfdx f0, r11, r10
; CHECK-NEXT: lfdx f1, r9, r10
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r7, r10
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r7, r10
; CHECK-NEXT: add r10, r10, r8
; CHECK-NEXT: bne cr0, .LBB7_6
; CHECK-NEXT: .LBB7_7: # %for.cond.cleanup
; CHECK-NEXT: ld r2, -152(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r31, -8(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: ld r28, -32(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r27, -40(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r26, -48(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r25, -56(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r24, -64(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r23, -72(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r22, -80(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r21, -88(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r20, -96(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r19, -104(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r18, -112(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r17, -120(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r16, -128(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r15, -136(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r14, -144(r1) # 8-byte Folded Reload
; CHECK-NEXT: blr
entry:
%cmp49 = icmp sgt i64 %m, 0
br i1 %cmp49, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%0 = shl i64 %m, 2
%smax52 = call i64 @llvm.smax.i64(i64 %0, i64 1)
%1 = add nsw i64 %smax52, -1
%xtraiter = and i64 %smax52, 1
%2 = icmp ult i64 %1, 3
br i1 %2, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
for.body.preheader.new: ; preds = %for.body.preheader
%unroll_iter = and i64 %smax52, 9223372036854775804
br label %for.body
for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
%inc.addr.050.unr = phi i64 [ %inc4, %for.body.preheader ], [ %add23.3, %for.body ]
%lcmp.mod.not = icmp eq i64 %xtraiter, 0
br i1 %lcmp.mod.not, label %for.cond.cleanup, label %for.body.epil
for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
%inc.addr.050.epil = phi i64 [ %add23.epil, %for.body.epil ], [ %inc.addr.050.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
%epil.iter = phi i64 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
%add.epil = add nsw i64 %inc.addr.050.epil, %inc1
%arrayidx.epil = getelementptr inbounds double, ptr %input1, i64 %add.epil
%3 = load double, ptr %arrayidx.epil, align 8
%arrayidx2.epil = getelementptr inbounds double, ptr %input2, i64 %add.epil
%4 = load double, ptr %arrayidx2.epil, align 8
%mul3.epil = fmul double %3, %4
%arrayidx5.epil = getelementptr inbounds double, ptr %output, i64 %add.epil
%5 = load double, ptr %arrayidx5.epil, align 8
%add6.epil = fadd double %5, %mul3.epil
store double %add6.epil, ptr %arrayidx5.epil, align 8
%add7.epil = add nsw i64 %inc.addr.050.epil, %inc2
%arrayidx8.epil = getelementptr inbounds double, ptr %input1, i64 %add7.epil
%6 = load double, ptr %arrayidx8.epil, align 8
%arrayidx10.epil = getelementptr inbounds double, ptr %input2, i64 %add7.epil
%7 = load double, ptr %arrayidx10.epil, align 8
%mul11.epil = fmul double %6, %7
%arrayidx13.epil = getelementptr inbounds double, ptr %output, i64 %add7.epil
%8 = load double, ptr %arrayidx13.epil, align 8
%add14.epil = fadd double %8, %mul11.epil
store double %add14.epil, ptr %arrayidx13.epil, align 8
%add15.epil = add nsw i64 %inc.addr.050.epil, %inc3
%arrayidx16.epil = getelementptr inbounds double, ptr %input1, i64 %add15.epil
%9 = load double, ptr %arrayidx16.epil, align 8
%arrayidx18.epil = getelementptr inbounds double, ptr %input2, i64 %add15.epil
%10 = load double, ptr %arrayidx18.epil, align 8
%mul19.epil = fmul double %9, %10
%arrayidx21.epil = getelementptr inbounds double, ptr %output, i64 %add15.epil
%11 = load double, ptr %arrayidx21.epil, align 8
%add22.epil = fadd double %11, %mul19.epil
store double %add22.epil, ptr %arrayidx21.epil, align 8
%add23.epil = add nsw i64 %inc.addr.050.epil, %inc4
%epil.iter.sub = add nsw i64 %epil.iter, -1
%epil.iter.cmp.not = icmp eq i64 %epil.iter.sub, 0
br i1 %epil.iter.cmp.not, label %for.cond.cleanup, label %for.body.epil
for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
ret i32 0
for.body: ; preds = %for.body, %for.body.preheader.new
%inc.addr.050 = phi i64 [ %inc4, %for.body.preheader.new ], [ %add23.3, %for.body ]
%niter = phi i64 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
%add = add nsw i64 %inc.addr.050, %inc1
%arrayidx = getelementptr inbounds double, ptr %input1, i64 %add
%12 = load double, ptr %arrayidx, align 8
%arrayidx2 = getelementptr inbounds double, ptr %input2, i64 %add
%13 = load double, ptr %arrayidx2, align 8
%mul3 = fmul double %12, %13
%arrayidx5 = getelementptr inbounds double, ptr %output, i64 %add
%14 = load double, ptr %arrayidx5, align 8
%add6 = fadd double %14, %mul3
store double %add6, ptr %arrayidx5, align 8
%add7 = add nsw i64 %inc.addr.050, %inc2
%arrayidx8 = getelementptr inbounds double, ptr %input1, i64 %add7
%15 = load double, ptr %arrayidx8, align 8
%arrayidx10 = getelementptr inbounds double, ptr %input2, i64 %add7
%16 = load double, ptr %arrayidx10, align 8
%mul11 = fmul double %15, %16
%arrayidx13 = getelementptr inbounds double, ptr %output, i64 %add7
%17 = load double, ptr %arrayidx13, align 8
%add14 = fadd double %17, %mul11
store double %add14, ptr %arrayidx13, align 8
%add15 = add nsw i64 %inc.addr.050, %inc3
%arrayidx16 = getelementptr inbounds double, ptr %input1, i64 %add15
%18 = load double, ptr %arrayidx16, align 8
%arrayidx18 = getelementptr inbounds double, ptr %input2, i64 %add15
%19 = load double, ptr %arrayidx18, align 8
%mul19 = fmul double %18, %19
%arrayidx21 = getelementptr inbounds double, ptr %output, i64 %add15
%20 = load double, ptr %arrayidx21, align 8
%add22 = fadd double %20, %mul19
store double %add22, ptr %arrayidx21, align 8
%add23 = add nsw i64 %inc.addr.050, %inc4
%add.1 = add nsw i64 %add23, %inc1
%arrayidx.1 = getelementptr inbounds double, ptr %input1, i64 %add.1
%21 = load double, ptr %arrayidx.1, align 8
%arrayidx2.1 = getelementptr inbounds double, ptr %input2, i64 %add.1
%22 = load double, ptr %arrayidx2.1, align 8
%mul3.1 = fmul double %21, %22
%arrayidx5.1 = getelementptr inbounds double, ptr %output, i64 %add.1
%23 = load double, ptr %arrayidx5.1, align 8
%add6.1 = fadd double %23, %mul3.1
store double %add6.1, ptr %arrayidx5.1, align 8
%add7.1 = add nsw i64 %add23, %inc2
%arrayidx8.1 = getelementptr inbounds double, ptr %input1, i64 %add7.1
%24 = load double, ptr %arrayidx8.1, align 8
%arrayidx10.1 = getelementptr inbounds double, ptr %input2, i64 %add7.1
%25 = load double, ptr %arrayidx10.1, align 8
%mul11.1 = fmul double %24, %25
%arrayidx13.1 = getelementptr inbounds double, ptr %output, i64 %add7.1
%26 = load double, ptr %arrayidx13.1, align 8
%add14.1 = fadd double %26, %mul11.1
store double %add14.1, ptr %arrayidx13.1, align 8
%add15.1 = add nsw i64 %add23, %inc3
%arrayidx16.1 = getelementptr inbounds double, ptr %input1, i64 %add15.1
%27 = load double, ptr %arrayidx16.1, align 8
%arrayidx18.1 = getelementptr inbounds double, ptr %input2, i64 %add15.1
%28 = load double, ptr %arrayidx18.1, align 8
%mul19.1 = fmul double %27, %28
%arrayidx21.1 = getelementptr inbounds double, ptr %output, i64 %add15.1
%29 = load double, ptr %arrayidx21.1, align 8
%add22.1 = fadd double %29, %mul19.1
store double %add22.1, ptr %arrayidx21.1, align 8
%add23.1 = add nsw i64 %add23, %inc4
%add.2 = add nsw i64 %add23.1, %inc1
%arrayidx.2 = getelementptr inbounds double, ptr %input1, i64 %add.2
%30 = load double, ptr %arrayidx.2, align 8
%arrayidx2.2 = getelementptr inbounds double, ptr %input2, i64 %add.2
%31 = load double, ptr %arrayidx2.2, align 8
%mul3.2 = fmul double %30, %31
%arrayidx5.2 = getelementptr inbounds double, ptr %output, i64 %add.2
%32 = load double, ptr %arrayidx5.2, align 8
%add6.2 = fadd double %32, %mul3.2
store double %add6.2, ptr %arrayidx5.2, align 8
%add7.2 = add nsw i64 %add23.1, %inc2
%arrayidx8.2 = getelementptr inbounds double, ptr %input1, i64 %add7.2
%33 = load double, ptr %arrayidx8.2, align 8
%arrayidx10.2 = getelementptr inbounds double, ptr %input2, i64 %add7.2
%34 = load double, ptr %arrayidx10.2, align 8
%mul11.2 = fmul double %33, %34
%arrayidx13.2 = getelementptr inbounds double, ptr %output, i64 %add7.2
%35 = load double, ptr %arrayidx13.2, align 8
%add14.2 = fadd double %35, %mul11.2
store double %add14.2, ptr %arrayidx13.2, align 8
%add15.2 = add nsw i64 %add23.1, %inc3
%arrayidx16.2 = getelementptr inbounds double, ptr %input1, i64 %add15.2
%36 = load double, ptr %arrayidx16.2, align 8
%arrayidx18.2 = getelementptr inbounds double, ptr %input2, i64 %add15.2
%37 = load double, ptr %arrayidx18.2, align 8
%mul19.2 = fmul double %36, %37
%arrayidx21.2 = getelementptr inbounds double, ptr %output, i64 %add15.2
%38 = load double, ptr %arrayidx21.2, align 8
%add22.2 = fadd double %38, %mul19.2
store double %add22.2, ptr %arrayidx21.2, align 8
%add23.2 = add nsw i64 %add23.1, %inc4
%add.3 = add nsw i64 %add23.2, %inc1
%arrayidx.3 = getelementptr inbounds double, ptr %input1, i64 %add.3
%39 = load double, ptr %arrayidx.3, align 8
%arrayidx2.3 = getelementptr inbounds double, ptr %input2, i64 %add.3
%40 = load double, ptr %arrayidx2.3, align 8
%mul3.3 = fmul double %39, %40
%arrayidx5.3 = getelementptr inbounds double, ptr %output, i64 %add.3
%41 = load double, ptr %arrayidx5.3, align 8
%add6.3 = fadd double %41, %mul3.3
store double %add6.3, ptr %arrayidx5.3, align 8
%add7.3 = add nsw i64 %add23.2, %inc2
%arrayidx8.3 = getelementptr inbounds double, ptr %input1, i64 %add7.3
%42 = load double, ptr %arrayidx8.3, align 8
%arrayidx10.3 = getelementptr inbounds double, ptr %input2, i64 %add7.3
%43 = load double, ptr %arrayidx10.3, align 8
%mul11.3 = fmul double %42, %43
%arrayidx13.3 = getelementptr inbounds double, ptr %output, i64 %add7.3
%44 = load double, ptr %arrayidx13.3, align 8
%add14.3 = fadd double %44, %mul11.3
store double %add14.3, ptr %arrayidx13.3, align 8
%add15.3 = add nsw i64 %add23.2, %inc3
%arrayidx16.3 = getelementptr inbounds double, ptr %input1, i64 %add15.3
%45 = load double, ptr %arrayidx16.3, align 8
%arrayidx18.3 = getelementptr inbounds double, ptr %input2, i64 %add15.3
%46 = load double, ptr %arrayidx18.3, align 8
%mul19.3 = fmul double %45, %46
%arrayidx21.3 = getelementptr inbounds double, ptr %output, i64 %add15.3
%47 = load double, ptr %arrayidx21.3, align 8
%add22.3 = fadd double %47, %mul19.3
store double %add22.3, ptr %arrayidx21.3, align 8
%add23.3 = add nsw i64 %add23.2, %inc4
%niter.nsub.3 = add i64 %niter, -4
%niter.ncmp.3.not = icmp eq i64 %niter.nsub.3, 0
br i1 %niter.ncmp.3.not, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
}
declare i64 @llvm.smax.i64(i64, i64)