Nikita Popov eecb99c5f6 [Tests] Add disjoint flag to some tests (NFC)
These tests rely on SCEV recognizing an "or" with no common
bits as an "add". Add the disjoint flag to relevant or instructions
in preparation for switching SCEV to use the flag instead of the
ValueTracking query. The IR with disjoint flag matches what
InstCombine would produce.
2023-12-05 14:09:36 +01:00

568 lines
19 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -O3 -mtriple=x86_64-unknown-unknown -mcpu=core2 | FileCheck %s -check-prefix=X64
; RUN: llc < %s -O3 -mtriple=i686-unknown-unknown -mcpu=core2 | FileCheck %s -check-prefix=X32
; @simple is the most basic chain of address induction variables. Chaining
; saves at least one register and avoids complex addressing and setup
; code.
;
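; Expected codegen, first list (presumably for x86-64 -- TODO confirm):
;   - the stride is computed as %x * 4
;   - no other address computation in the preheader
;   - no complex address modes
;
; Second list (presumably for i686 -- TODO confirm):
;   - no expensive address computation in the preheader
;   - no complex address modes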
define i32 @simple(ptr %a, ptr %b, i32 %x) nounwind {
; X64-LABEL: simple:
; X64: # %bb.0: # %entry
; X64-NEXT: movslq %edx, %rcx
; X64-NEXT: shlq $2, %rcx
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: leaq (%rcx,%rcx), %rdx
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB0_1: # %loop
; X64-NEXT: # =>This Inner Loop Header: Depth=1
; X64-NEXT: addl (%rdi), %eax
; X64-NEXT: addl (%rdi,%rcx), %eax
; X64-NEXT: leaq (%rdi,%rcx), %r8
; X64-NEXT: addl (%rcx,%r8), %eax
; X64-NEXT: addq %rcx, %r8
; X64-NEXT: addl (%rcx,%r8), %eax
; X64-NEXT: addq %rdx, %r8
; X64-NEXT: movq %r8, %rdi
; X64-NEXT: cmpq %rsi, %r8
; X64-NEXT: jne .LBB0_1
; X64-NEXT: # %bb.2: # %exit
; X64-NEXT: retq
;
; X32-LABEL: simple:
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: shll $2, %edx
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: leal (%edx,%edx), %esi
; X32-NEXT: .p2align 4, 0x90
; X32-NEXT: .LBB0_1: # %loop
; X32-NEXT: # =>This Inner Loop Header: Depth=1
; X32-NEXT: addl (%edi), %eax
; X32-NEXT: addl (%edi,%edx), %eax
; X32-NEXT: leal (%edi,%edx), %ebx
; X32-NEXT: addl (%edx,%ebx), %eax
; X32-NEXT: addl %edx, %ebx
; X32-NEXT: addl (%edx,%ebx), %eax
; X32-NEXT: addl %esi, %ebx
; X32-NEXT: movl %ebx, %edi
; X32-NEXT: cmpl %ecx, %ebx
; X32-NEXT: jne .LBB0_1
; X32-NEXT: # %bb.2: # %exit
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
; X32-NEXT: popl %ebx
; X32-NEXT: retl
;
; IR under test: a single loop that walks the pointer IV %iv from %a until the
; advanced pointer equals %b. Each iteration performs four i32 loads, each one
; %x elements past the previous address, and adds them into the running sum.
entry:
br label %loop
loop:
; %iv is the pointer induction variable; %s is the running sum (starts at 0).
%iv = phi ptr [ %a, %entry ], [ %iv4, %loop ]
%s = phi i32 [ 0, %entry ], [ %s4, %loop ]
%v = load i32, ptr %iv
; %iv1..%iv3 form a chain: each address is the previous one plus %x elements.
%iv1 = getelementptr inbounds i32, ptr %iv, i32 %x
%v1 = load i32, ptr %iv1
%iv2 = getelementptr inbounds i32, ptr %iv1, i32 %x
%v2 = load i32, ptr %iv2
%iv3 = getelementptr inbounds i32, ptr %iv2, i32 %x
%v3 = load i32, ptr %iv3
; Accumulate the four loaded values.
%s1 = add i32 %s, %v
%s2 = add i32 %s1, %v1
%s3 = add i32 %s2, %v2
%s4 = add i32 %s3, %v3
; Next IV value: one more %x-element step past the last load address, so the
; IV advances by 4 * %x elements per iteration.
%iv4 = getelementptr inbounds i32, ptr %iv3, i32 %x
; The loop exits only when the advanced pointer is exactly equal to %b.
%cmp = icmp eq ptr %iv4, %b
br i1 %cmp, label %exit, label %loop
exit:
ret i32 %s4
}
; @user is not currently chained because the IV is live across memory ops.
;
; expensive address computation in the preheader
; complex address modes
define i32 @user(ptr %a, ptr %b, i32 %x) nounwind {
; X64-LABEL: user:
; X64: # %bb.0: # %entry
; X64-NEXT: movslq %edx, %rcx
; X64-NEXT: movq %rcx, %rdx
; X64-NEXT: shlq $4, %rdx
; X64-NEXT: leaq (,%rcx,4), %rax
; X64-NEXT: leaq (%rax,%rax,2), %r8
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB1_1: # %loop
; X64-NEXT: # =>This Inner Loop Header: Depth=1
; X64-NEXT: addl (%rdi), %eax
; X64-NEXT: addl (%rdi,%rcx,4), %eax
; X64-NEXT: addl (%rdi,%rcx,8), %eax
; X64-NEXT: addl (%rdi,%r8), %eax
; X64-NEXT: movl %eax, (%rdi)
; X64-NEXT: addq %rdx, %rdi
; X64-NEXT: cmpq %rsi, %rdi
; X64-NEXT: jne .LBB1_1
; X64-NEXT: # %bb.2: # %exit
; X64-NEXT: retq
;
; X32-LABEL: user:
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl %ecx, %edi
; X32-NEXT: shll $4, %edi
; X32-NEXT: leal (,%ecx,4), %eax
; X32-NEXT: leal (%eax,%eax,2), %ebx
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: .p2align 4, 0x90
; X32-NEXT: .LBB1_1: # %loop
; X32-NEXT: # =>This Inner Loop Header: Depth=1
; X32-NEXT: addl (%esi), %eax
; X32-NEXT: addl (%esi,%ecx,4), %eax
; X32-NEXT: addl (%esi,%ecx,8), %eax
; X32-NEXT: addl (%esi,%ebx), %eax
; X32-NEXT: movl %eax, (%esi)
; X32-NEXT: addl %edi, %esi
; X32-NEXT: cmpl %edx, %esi
; X32-NEXT: jne .LBB1_1
; X32-NEXT: # %bb.2: # %exit
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
; X32-NEXT: popl %ebx
; X32-NEXT: retl
;
; IR under test: the same four-load pointer walk as @simple, plus a store of the
; running sum through the original %iv at the end of each iteration.
entry:
br label %loop
loop:
; %iv is the pointer induction variable; %s is the running sum (starts at 0).
%iv = phi ptr [ %a, %entry ], [ %iv4, %loop ]
%s = phi i32 [ 0, %entry ], [ %s4, %loop ]
%v = load i32, ptr %iv
; Each subsequent address is the previous one plus %x i32 elements.
%iv1 = getelementptr inbounds i32, ptr %iv, i32 %x
%v1 = load i32, ptr %iv1
%iv2 = getelementptr inbounds i32, ptr %iv1, i32 %x
%v2 = load i32, ptr %iv2
%iv3 = getelementptr inbounds i32, ptr %iv2, i32 %x
%v3 = load i32, ptr %iv3
; Accumulate the four loaded values.
%s1 = add i32 %s, %v
%s2 = add i32 %s1, %v1
%s3 = add i32 %s2, %v2
%s4 = add i32 %s3, %v3
%iv4 = getelementptr inbounds i32, ptr %iv3, i32 %x
; This store uses the un-advanced %iv after all four loads; it is the only
; instruction in the loop body that @simple does not have.
store i32 %s4, ptr %iv
; The loop exits only when the advanced pointer is exactly equal to %b.
%cmp = icmp eq ptr %iv4, %b
br i1 %cmp, label %exit, label %loop
exit:
ret i32 %s4
}
; @extrastride is a slightly more interesting case of a single
; complete chain with multiple strides. The test case IR is what LSR
; used to do, and exactly what we don't want to do. LSR's new IV
; chaining feature should now undo the damage.
;
; We currently don't handle this on X64 because the sexts cause
; strange increment expressions like this:
; IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
;
; For x32, no spills in the preheader, no complex address modes, no reloads.
define void @extrastride(ptr nocapture %main, i32 %main_stride, ptr nocapture %res, i32 %x, i32 %y, i32 %z) nounwind {
; X64-LABEL: extrastride:
; X64: # %bb.0: # %entry
; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
; X64-NEXT: # kill: def $esi killed $esi def $rsi
; X64-NEXT: testl %r9d, %r9d
; X64-NEXT: je .LBB2_4
; X64-NEXT: # %bb.1: # %for.body.lr.ph
; X64-NEXT: pushq %rbx
; X64-NEXT: leal (%rsi,%rsi), %r10d
; X64-NEXT: leal (%rsi,%rsi,2), %r11d
; X64-NEXT: addl %esi, %ecx
; X64-NEXT: leal (,%rsi,4), %eax
; X64-NEXT: leal (%rcx,%rsi,4), %ebx
; X64-NEXT: cltq
; X64-NEXT: movslq %r11d, %rcx
; X64-NEXT: movslq %r10d, %r10
; X64-NEXT: movslq %esi, %rsi
; X64-NEXT: movslq %r8d, %r8
; X64-NEXT: shlq $2, %r8
; X64-NEXT: movslq %ebx, %r11
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB2_2: # %for.body
; X64-NEXT: # =>This Inner Loop Header: Depth=1
; X64-NEXT: movl (%rdi,%rsi), %ebx
; X64-NEXT: addl (%rdi), %ebx
; X64-NEXT: addl (%rdi,%r10), %ebx
; X64-NEXT: addl (%rdi,%rcx), %ebx
; X64-NEXT: addl (%rdi,%rax), %ebx
; X64-NEXT: movl %ebx, (%rdx)
; X64-NEXT: addq %r11, %rdi
; X64-NEXT: addq %r8, %rdx
; X64-NEXT: decl %r9d
; X64-NEXT: jne .LBB2_2
; X64-NEXT: # %bb.3:
; X64-NEXT: popq %rbx
; X64-NEXT: .LBB2_4: # %for.end
; X64-NEXT: retq
;
; X32-LABEL: extrastride:
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %ebp
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: testl %eax, %eax
; X32-NEXT: je .LBB2_3
; X32-NEXT: # %bb.1: # %for.body.lr.ph
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-NEXT: addl %esi, %edi
; X32-NEXT: shll $2, %ecx
; X32-NEXT: .p2align 4, 0x90
; X32-NEXT: .LBB2_2: # %for.body
; X32-NEXT: # =>This Inner Loop Header: Depth=1
; X32-NEXT: movl (%ebx,%esi), %ebp
; X32-NEXT: addl (%ebx), %ebp
; X32-NEXT: addl %esi, %ebx
; X32-NEXT: addl (%esi,%ebx), %ebp
; X32-NEXT: addl %esi, %ebx
; X32-NEXT: addl (%esi,%ebx), %ebp
; X32-NEXT: addl %esi, %ebx
; X32-NEXT: addl (%esi,%ebx), %ebp
; X32-NEXT: movl %ebp, (%edx)
; X32-NEXT: addl %esi, %ebx
; X32-NEXT: addl %edi, %ebx
; X32-NEXT: addl %ecx, %edx
; X32-NEXT: decl %eax
; X32-NEXT: jne .LBB2_2
; X32-NEXT: .LBB2_3: # %for.end
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
; X32-NEXT: popl %ebx
; X32-NEXT: popl %ebp
; X32-NEXT: retl
;
; IR under test: each iteration loads five i32 values at byte offsets 0, s, 2s,
; 3s and 4s from %main.addr.011 (s = %main_stride), stores their sum through
; %res.addr.09, then advances the main pointer by s*5+x bytes and the result
; pointer by %y i32 elements. The loop runs %z times.
entry:
; Skip the loop entirely when the trip count %z is zero.
%cmp8 = icmp eq i32 %z, 0
br i1 %cmp8, label %for.end, label %for.body.lr.ph
for.body.lr.ph: ; preds = %entry
; All byte offsets are precomputed here as multiples of the stride.
%add.ptr.sum = shl i32 %main_stride, 1 ; s*2
%add.ptr1.sum = add i32 %add.ptr.sum, %main_stride ; s*3
%add.ptr2.sum = add i32 %x, %main_stride ; s + x
%add.ptr4.sum = shl i32 %main_stride, 2 ; s*4
%add.ptr3.sum = add i32 %add.ptr2.sum, %add.ptr4.sum ; total IV stride = s*5+x
br label %for.body
for.body: ; preds = %for.body.lr.ph, %for.body
; Three recurrences: the main pointer, the iteration counter, and the result
; pointer.
%main.addr.011 = phi ptr [ %main, %for.body.lr.ph ], [ %add.ptr6, %for.body ]
%i.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
%res.addr.09 = phi ptr [ %res, %for.body.lr.ph ], [ %add.ptr7, %for.body ]
; Every load address below is computed directly from %main.addr.011 (i8 GEPs,
; so the offsets are in bytes) rather than from the previous address.
%0 = load i32, ptr %main.addr.011, align 4
%add.ptr = getelementptr inbounds i8, ptr %main.addr.011, i32 %main_stride
%1 = load i32, ptr %add.ptr, align 4
%add.ptr1 = getelementptr inbounds i8, ptr %main.addr.011, i32 %add.ptr.sum
%2 = load i32, ptr %add.ptr1, align 4
%add.ptr2 = getelementptr inbounds i8, ptr %main.addr.011, i32 %add.ptr1.sum
%3 = load i32, ptr %add.ptr2, align 4
; Note the name mismatch: %add.ptr3 is addressed with %add.ptr4.sum (s*4);
; %add.ptr3.sum (s*5+x) is used only for the IV increment %add.ptr6 below.
%add.ptr3 = getelementptr inbounds i8, ptr %main.addr.011, i32 %add.ptr4.sum
%4 = load i32, ptr %add.ptr3, align 4
; Sum the five loaded values and store the result.
%add = add i32 %1, %0
%add4 = add i32 %add, %2
%add5 = add i32 %add4, %3
%add6 = add i32 %add5, %4
store i32 %add6, ptr %res.addr.09, align 4
; Advance the main pointer by s*5+x bytes and the result pointer by %y elements.
%add.ptr6 = getelementptr inbounds i8, ptr %main.addr.011, i32 %add.ptr3.sum
%add.ptr7 = getelementptr inbounds i32, ptr %res.addr.09, i32 %y
%inc = add i32 %i.010, 1
%cmp = icmp eq i32 %inc, %z
br i1 %cmp, label %for.end, label %for.body
for.end: ; preds = %for.body, %entry
ret void
}
; @foldedidx is an unrolled variant of this loop:
; for (unsigned long i = 0; i < len; i += s) {
; c[i] = a[i] + b[i];
; }
; where 's' can be folded into the addressing mode.
; Consequently, we should *not* form any chains.
define void @foldedidx(ptr nocapture %a, ptr nocapture %b, ptr nocapture %c) nounwind ssp {
; X64-LABEL: foldedidx:
; X64: # %bb.0: # %entry
; X64-NEXT: movl $3, %eax
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB3_1: # %for.body
; X64-NEXT: # =>This Inner Loop Header: Depth=1
; X64-NEXT: movzbl -3(%rdi,%rax), %ecx
; X64-NEXT: movzbl -3(%rsi,%rax), %r8d
; X64-NEXT: addl %ecx, %r8d
; X64-NEXT: movb %r8b, -3(%rdx,%rax)
; X64-NEXT: movzbl -2(%rdi,%rax), %ecx
; X64-NEXT: movzbl -2(%rsi,%rax), %r8d
; X64-NEXT: addl %ecx, %r8d
; X64-NEXT: movb %r8b, -2(%rdx,%rax)
; X64-NEXT: movzbl -1(%rdi,%rax), %ecx
; X64-NEXT: movzbl -1(%rsi,%rax), %r8d
; X64-NEXT: addl %ecx, %r8d
; X64-NEXT: movb %r8b, -1(%rdx,%rax)
; X64-NEXT: movzbl (%rdi,%rax), %ecx
; X64-NEXT: movzbl (%rsi,%rax), %r8d
; X64-NEXT: addl %ecx, %r8d
; X64-NEXT: movb %r8b, (%rdx,%rax)
; X64-NEXT: addq $4, %rax
; X64-NEXT: cmpl $403, %eax # imm = 0x193
; X64-NEXT: jne .LBB3_1
; X64-NEXT: # %bb.2: # %for.end
; X64-NEXT: retq
;
; X32-LABEL: foldedidx:
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %edi
; X32-NEXT: pushl %esi
; X32-NEXT: movl $3, %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-NEXT: .p2align 4, 0x90
; X32-NEXT: .LBB3_1: # %for.body
; X32-NEXT: # =>This Inner Loop Header: Depth=1
; X32-NEXT: movzbl -3(%esi,%eax), %edi
; X32-NEXT: movzbl -3(%edx,%eax), %ebx
; X32-NEXT: addl %edi, %ebx
; X32-NEXT: movb %bl, -3(%ecx,%eax)
; X32-NEXT: movzbl -2(%esi,%eax), %edi
; X32-NEXT: movzbl -2(%edx,%eax), %ebx
; X32-NEXT: addl %edi, %ebx
; X32-NEXT: movb %bl, -2(%ecx,%eax)
; X32-NEXT: movzbl -1(%esi,%eax), %edi
; X32-NEXT: movzbl -1(%edx,%eax), %ebx
; X32-NEXT: addl %edi, %ebx
; X32-NEXT: movb %bl, -1(%ecx,%eax)
; X32-NEXT: movzbl (%esi,%eax), %edi
; X32-NEXT: movzbl (%edx,%eax), %ebx
; X32-NEXT: addl %edi, %ebx
; X32-NEXT: movb %bl, (%ecx,%eax)
; X32-NEXT: addl $4, %eax
; X32-NEXT: cmpl $403, %eax # imm = 0x193
; X32-NEXT: jne .LBB3_1
; X32-NEXT: # %bb.2: # %for.end
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
; X32-NEXT: popl %ebx
; X32-NEXT: retl
;
; IR under test: a loop unrolled four times. Each of the four lanes computes
; c[idx] = a[idx] + b[idx] on i8 elements for idx = i, i|1, i|2, i|3. The index
; %i.07 starts at 0, advances by 4 per iteration, and the loop exits at 400.
entry:
br label %for.body
for.body: ; preds = %for.body, %entry
%i.07 = phi i32 [ 0, %entry ], [ %inc.3, %for.body ]
; Lane 0: index %i.07. Bytes are zero-extended to i32, added, then truncated
; back to i8 before the store.
%arrayidx = getelementptr inbounds i8, ptr %a, i32 %i.07
%0 = load i8, ptr %arrayidx, align 1
%conv5 = zext i8 %0 to i32
%arrayidx1 = getelementptr inbounds i8, ptr %b, i32 %i.07
%1 = load i8, ptr %arrayidx1, align 1
%conv26 = zext i8 %1 to i32
%add = add nsw i32 %conv26, %conv5
%conv3 = trunc i32 %add to i8
%arrayidx4 = getelementptr inbounds i8, ptr %c, i32 %i.07
store i8 %conv3, ptr %arrayidx4, align 1
; Lane 1: index %i.07 | 1. The offset is written as "or disjoint" rather than
; "add"; the flag states that the two operands have no set bits in common.
%inc1 = or disjoint i32 %i.07, 1
%arrayidx.1 = getelementptr inbounds i8, ptr %a, i32 %inc1
%2 = load i8, ptr %arrayidx.1, align 1
%conv5.1 = zext i8 %2 to i32
%arrayidx1.1 = getelementptr inbounds i8, ptr %b, i32 %inc1
%3 = load i8, ptr %arrayidx1.1, align 1
%conv26.1 = zext i8 %3 to i32
%add.1 = add nsw i32 %conv26.1, %conv5.1
%conv3.1 = trunc i32 %add.1 to i8
%arrayidx4.1 = getelementptr inbounds i8, ptr %c, i32 %inc1
store i8 %conv3.1, ptr %arrayidx4.1, align 1
; Lane 2: index %i.07 | 2.
%inc.12 = or disjoint i32 %i.07, 2
%arrayidx.2 = getelementptr inbounds i8, ptr %a, i32 %inc.12
%4 = load i8, ptr %arrayidx.2, align 1
%conv5.2 = zext i8 %4 to i32
%arrayidx1.2 = getelementptr inbounds i8, ptr %b, i32 %inc.12
%5 = load i8, ptr %arrayidx1.2, align 1
%conv26.2 = zext i8 %5 to i32
%add.2 = add nsw i32 %conv26.2, %conv5.2
%conv3.2 = trunc i32 %add.2 to i8
%arrayidx4.2 = getelementptr inbounds i8, ptr %c, i32 %inc.12
store i8 %conv3.2, ptr %arrayidx4.2, align 1
; Lane 3: index %i.07 | 3.
%inc.23 = or disjoint i32 %i.07, 3
%arrayidx.3 = getelementptr inbounds i8, ptr %a, i32 %inc.23
%6 = load i8, ptr %arrayidx.3, align 1
%conv5.3 = zext i8 %6 to i32
%arrayidx1.3 = getelementptr inbounds i8, ptr %b, i32 %inc.23
%7 = load i8, ptr %arrayidx1.3, align 1
%conv26.3 = zext i8 %7 to i32
%add.3 = add nsw i32 %conv26.3, %conv5.3
%conv3.3 = trunc i32 %add.3 to i8
%arrayidx4.3 = getelementptr inbounds i8, ptr %c, i32 %inc.23
store i8 %conv3.3, ptr %arrayidx4.3, align 1
; Advance the index by the unroll factor and exit once it reaches 400.
%inc.3 = add nsw i32 %i.07, 4
%exitcond.3 = icmp eq i32 %inc.3, 400
br i1 %exitcond.3, label %for.end, label %for.body
for.end: ; preds = %for.body
ret void
}
; @multioper tests instructions with multiple IV user operands. We
; should be able to chain them independent of each other.
define void @multioper(ptr %a, i32 %n) nounwind {
; X64-LABEL: multioper:
; X64: # %bb.0: # %entry
; X64-NEXT: xorl %eax, %eax
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB4_1: # %for.body
; X64-NEXT: # =>This Inner Loop Header: Depth=1
; X64-NEXT: movl %eax, (%rdi,%rax,4)
; X64-NEXT: leal 1(%rax), %ecx
; X64-NEXT: movl %ecx, 4(%rdi,%rax,4)
; X64-NEXT: leal 2(%rax), %ecx
; X64-NEXT: movl %ecx, 8(%rdi,%rax,4)
; X64-NEXT: leal 3(%rax), %ecx
; X64-NEXT: movl %ecx, 12(%rdi,%rax,4)
; X64-NEXT: addq $4, %rax
; X64-NEXT: cmpl %esi, %eax
; X64-NEXT: jl .LBB4_1
; X64-NEXT: # %bb.2: # %exit
; X64-NEXT: retq
;
; X32-LABEL: multioper:
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %esi
; X32-NEXT: xorl %eax, %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: .p2align 4, 0x90
; X32-NEXT: .LBB4_1: # %for.body
; X32-NEXT: # =>This Inner Loop Header: Depth=1
; X32-NEXT: movl %eax, (%edx,%eax,4)
; X32-NEXT: leal 1(%eax), %esi
; X32-NEXT: movl %esi, 4(%edx,%eax,4)
; X32-NEXT: leal 2(%eax), %esi
; X32-NEXT: movl %esi, 8(%edx,%eax,4)
; X32-NEXT: leal 3(%eax), %esi
; X32-NEXT: movl %esi, 12(%edx,%eax,4)
; X32-NEXT: addl $4, %eax
; X32-NEXT: cmpl %ecx, %eax
; X32-NEXT: jl .LBB4_1
; X32-NEXT: # %bb.2: # %exit
; X32-NEXT: popl %esi
; X32-NEXT: retl
;
; IR under test: every store uses two induction variables at once, the integer
; %i as the stored value and the pointer %p as the address. Each iteration
; writes i, i+1, i+2 and i+3 to four consecutive i32 slots starting at %p.
entry:
br label %for.body
for.body:
; %p is the pointer IV (starts at %a); %i is the integer IV (starts at 0).
%p = phi ptr [ %p.next, %for.body ], [ %a, %entry ]
%i = phi i32 [ %inc4, %for.body ], [ 0, %entry ]
store i32 %i, ptr %p, align 4
; The +1 offset is expressed as "or disjoint", while +2 and +3 below use
; "add nsw".
%inc1 = or disjoint i32 %i, 1
%add.ptr.i1 = getelementptr inbounds i32, ptr %p, i32 1
store i32 %inc1, ptr %add.ptr.i1, align 4
%inc2 = add nsw i32 %i, 2
%add.ptr.i2 = getelementptr inbounds i32, ptr %p, i32 2
store i32 %inc2, ptr %add.ptr.i2, align 4
%inc3 = add nsw i32 %i, 3
%add.ptr.i3 = getelementptr inbounds i32, ptr %p, i32 3
store i32 %inc3, ptr %add.ptr.i3, align 4
; Both IVs advance by 4 per iteration (4 elements for %p, 4 for %i).
%p.next = getelementptr inbounds i32, ptr %p, i32 4
%inc4 = add nsw i32 %i, 4
; Signed comparison: keep looping while the advanced %i is less than %n.
%cmp = icmp slt i32 %inc4, %n
br i1 %cmp, label %for.body, label %exit
exit:
ret void
}
; @testCmpZero has an ICmpZero LSR use that should not be hidden from
; LSR. Profitable chains should have more than one nonzero increment
; anyway.
define void @testCmpZero(ptr %src, ptr %dst, i32 %srcidx, i32 %dstidx, i32 %len) nounwind ssp {
; X64-LABEL: testCmpZero:
; X64: # %bb.0: # %entry
; X64-NEXT: movslq %edx, %rdx
; X64-NEXT: addq %rdx, %rdi
; X64-NEXT: movslq %ecx, %rax
; X64-NEXT: addq %rsi, %rax
; X64-NEXT: addl %edx, %r8d
; X64-NEXT: movslq %r8d, %rcx
; X64-NEXT: subq %rdx, %rcx
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: .p2align 4, 0x90
; X64-NEXT: .LBB5_1: # %for.body82.us
; X64-NEXT: # =>This Inner Loop Header: Depth=1
; X64-NEXT: movzbl (%rax,%rdx,4), %esi
; X64-NEXT: movb %sil, (%rdi,%rdx)
; X64-NEXT: incq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB5_1
; X64-NEXT: # %bb.2: # %return
; X64-NEXT: retq
;
; X32-LABEL: testCmpZero:
; X32: # %bb.0: # %entry
; X32-NEXT: pushl %ebx
; X32-NEXT: pushl %esi
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-NEXT: addl {{[0-9]+}}(%esp), %edx
; X32-NEXT: xorl %esi, %esi
; X32-NEXT: .p2align 4, 0x90
; X32-NEXT: .LBB5_1: # %for.body82.us
; X32-NEXT: # =>This Inner Loop Header: Depth=1
; X32-NEXT: movzbl (%edx,%esi,4), %ebx
; X32-NEXT: movb %bl, (%ecx,%esi)
; X32-NEXT: incl %esi
; X32-NEXT: cmpl %esi, %eax
; X32-NEXT: jne .LBB5_1
; X32-NEXT: # %bb.2: # %return
; X32-NEXT: popl %esi
; X32-NEXT: popl %edi
; X32-NEXT: popl %ebx
; X32-NEXT: retl
;
; IR under test: a loop with two pointer IVs of different strides. Each
; iteration loads an i32 from %source, truncates it to i8 and stores it through
; %dest; %source advances by 4 bytes and %dest by 1 byte.
entry:
; Note the naming: %dest0 is derived from %src and %source0 from %dst.
%dest0 = getelementptr inbounds i8, ptr %src, i32 %srcidx
%source0 = getelementptr inbounds i8, ptr %dst, i32 %dstidx
; The exit limit is %src + (%srcidx + %len) bytes, i.e. %len bytes past %dest0.
%add.ptr79.us.sum = add i32 %srcidx, %len
%lftr.limit = getelementptr i8, ptr %src, i32 %add.ptr79.us.sum
br label %for.body82.us
for.body82.us:
%dest = phi ptr [ %dest0, %entry ], [ %incdec.ptr91.us, %for.body82.us ]
%source = phi ptr [ %source0, %entry ], [ %add.ptr83.us, %for.body82.us ]
%0 = load i32, ptr %source, align 4
%trunc = trunc i32 %0 to i8
; Advance both pointers before the store; the store still uses the old %dest.
%add.ptr83.us = getelementptr inbounds i8, ptr %source, i32 4
%incdec.ptr91.us = getelementptr inbounds i8, ptr %dest, i32 1
store i8 %trunc, ptr %dest, align 1
; Exit when the advanced destination pointer is exactly equal to the limit.
%exitcond = icmp eq ptr %incdec.ptr91.us, %lftr.limit
br i1 %exitcond, label %return, label %for.body82.us
return:
ret void
}