Sanjay Patel f0dd12ec5c [x86] use zero-extending load of a byte outside of loops too (2nd try)
The first attempt missed changing test files for tools
(update_llc_test_checks.py).

Original commit message:

This implements the main suggested change from issue #56498.
Using the shorter (non-extending) instruction with only
-Oz ("minsize") rather than -Os ("optsize") is left as a
possible follow-up.

As noted in the bug report, the zero-extending load may have
shorter latency/better throughput across a wide range of x86
micro-arches, and it avoids a potential false dependency.
The cost is an extra instruction byte.

This could cause perf ups and downs from secondary effects,
but I don't think it is possible to account for those in
advance, and that will likely also depend on exact micro-arch.
This does bring LLVM x86 codegen more in line with existing
gcc codegen, so if problems are exposed they are more likely
to occur for both compilers.

Differential Revision: https://reviews.llvm.org/D129775
2022-07-19 21:27:08 -04:00

43 lines
1.2 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s
; The llc invocation above compiles this file for the i386 Linux triple and
; pipes the assembly to FileCheck, which matches it against the assertions
; embedded in fn4 below.
;
; @d is a zero-initialized, 4-byte-aligned i32 global. fn4 loads it and uses
; the loaded value as the shift amount of an arithmetic shift right.
@d = global i32 0, align 4
; Verify the sar happens before ecx is clobbered with the parameter being
; passed to fn3
;
; In the expected assembly below, the shift count is loaded from d into %ecx
; with a zero-extending byte load (movzbl) and consumed by "sarl %cl, %esi".
; Only after that shift is %ecx overwritten with the constant 2, the first
; argument of the fastcc call to fn3. The assertions pin that ordering; if the
; "movl $2, %ecx" were scheduled ahead of the sarl, the shift would use the
; wrong count.
;
; The expected output also shows how the four fn3 arguments are passed: the
; constants 2 and 5 in %ecx and %edx, and the remaining two (the shifted value
; in %esi and the original %i in %eax) pushed on the stack.
;
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the script rather than editing them by hand.
define i32 @fn4(i32 %i) #0 {
; CHECK-LABEL: fn4:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: pushl %esi
; CHECK-NEXT: subl $8, %esp
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movzbl d, %ecx
; CHECK-NEXT: movl %eax, %esi
; CHECK-NEXT: sarl %cl, %esi
; CHECK-NEXT: subl $8, %esp
; CHECK-NEXT: movl $2, %ecx
; CHECK-NEXT: movl $5, %edx
; CHECK-NEXT: pushl %eax
; CHECK-NEXT: pushl %esi
; CHECK-NEXT: calll fn3@PLT
; CHECK-NEXT: addl $16, %esp
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testl %esi, %esi
; CHECK-NEXT: setle %al
; CHECK-NEXT: addl $8, %esp
; CHECK-NEXT: popl %esi
; CHECK-NEXT: retl
entry:
; Load the shift amount from the global @d.
%0 = load i32, ptr @d, align 4
; Arithmetic shift right of the argument by the loaded amount.
%shr = ashr i32 %i, %0
; Call fn3 with two constants, the shifted value and the original argument.
; %shr is used again after the call, so it has to survive it (the expected
; output keeps it in %esi, which is saved and restored around the body).
tail call fastcc void @fn3(i32 2, i32 5, i32 %shr, i32 %i)
; Return 1 if the shifted value is less than 1 (signed), otherwise 0.
%cmp = icmp slt i32 %shr, 1
%. = zext i1 %cmp to i32
ret i32 %.
}
; fn3 is only declared in this file; it is the callee of the call in fn4 and
; takes four i32 parameters and returns nothing.
; NOTE(review): the call site in fn4 is marked fastcc while this declaration
; carries no calling convention - confirm the mismatch is intentional for this
; reproducer before changing either side, since the assertions in fn4 depend
; on how the call is lowered.
declare void @fn3(i32 %p1, i32 %p2, i32 %p3, i32 %p4) #0
; Attribute group #0 is attached to both fn4 and fn3 and marks them nounwind.
attributes #0 = { nounwind }