
When code calls a function which then immediately tail calls another
function there is no need to go via the intermediate function. By
branching directly to the target function we reduce the program's working
set for a slight increase in runtime performance.

Normally it is relatively uncommon to have functions that just tail call
another function, but with LLVM control flow integrity we have jump tables
that replace the function itself as the canonical address. As a result,
when a function address is taken and called directly, for example after
a compiler optimization resolves the indirect call, or if code built
without control flow integrity calls the function, the call will go via
the jump table.

The impact of this optimization was measured using a large internal
Google benchmark. The results were as follows:

CFI enabled: +0.1% ± 0.05% queries per second
CFI disabled: +0.01% queries per second [not statistically significant]

The optimization is enabled by default at -O2 but may also be enabled
or disabled individually with --{,no-}branch-to-branch.

This optimization is implemented for AArch64 and X86_64 only.

lld's runtime performance (real execution time) after adding this
optimization was measured using firefox-x64 from lld-speed-test [1]
with ldflags "-O2 -S" on an Apple M2 Ultra. The results are as follows:

```
    N           Min           Max        Median           Avg        Stddev
x 512     1.2264546     1.3481076     1.2970261     1.2965788   0.018620888
+ 512     1.2561196     1.3839965     1.3214632     1.3209327   0.019443971
Difference at 95.0% confidence
        0.0243538 +/- 0.00233202
        1.87831% +/- 0.179859%
        (Student's t, pooled s = 0.0190369)
```

[1] https://discourse.llvm.org/t/improving-the-reproducibility-of-linker-benchmarking/86057

Pull Request: https://github.com/llvm/llvm-project/pull/138366
134 lines
3.7 KiB
ArmAsm
134 lines
3.7 KiB
ArmAsm
# REQUIRES: x86

## Test that the branch-to-branch optimization follows the links
## from f1 -> f2 -> f3 and updates all references to point to f3.
## It is exercised both with the explicit --branch-to-branch flag and with
## -O2, which enables it by default.

# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o
# RUN: ld.lld %t.o -o %t --branch-to-branch --emit-relocs
# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,B2B %s
# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,B2B-RELOC %s
# RUN: ld.lld %t.o -o %t -O2 --emit-relocs
# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,B2B %s
# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,B2B-RELOC %s

## Test that branch-to-branch is disabled by default (without -O2) and that
## --no-branch-to-branch turns it off again at -O2.

# RUN: ld.lld %t.o -o %t --emit-relocs
# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,NOB2B %s
# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,NOB2B-RELOC %s
# RUN: ld.lld %t.o -o %t -O2 --no-branch-to-branch --emit-relocs
# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,NOB2B %s
# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,NOB2B-RELOC %s

## Test that branch-to-branch is disabled for preemptible symbols
## (linking with -shared makes the global functions in this file preemptible).

# RUN: ld.lld %t.o -o %t --branch-to-branch -shared --emit-relocs
# RUN: llvm-objdump -d -s %t | FileCheck --check-prefixes=CHECK,NOB2B %s
# RUN: llvm-objdump -r %t | FileCheck --check-prefixes=RELOC,NOB2B-RELOC %s

## Relative vtable. Each 4-byte slot holds the PLT32 distance from `vtable`
## to a function, so the relocation addend equals the slot's offset inside
## the table (0x0, 0x4, 0x8). When the optimization is enabled the f1 and f2
## slots are retargeted to f3, and all three slots are then expected to hold
## the same value, which the section-contents checks capture as VF.
.section .rodata.vtable,"a"
.globl vtable
vtable:
# B2B: Contents of section .rodata:
# RELOC: RELOCATION RECORDS FOR [.rodata]:
# RELOC-NEXT: OFFSET
# B2B-NEXT: [[VF:[0-9a-f]{8}]]
# B2B-RELOC-NEXT: R_X86_64_PLT32 f3
# NOB2B-RELOC-NEXT: R_X86_64_PLT32 f1
.4byte f1@PLT - vtable
# B2B-SAME: [[VF]]
# B2B-RELOC-NEXT: R_X86_64_PLT32 f3+0x4
# NOB2B-RELOC-NEXT: R_X86_64_PLT32 f2+0x4
.4byte f2@PLT - vtable
# B2B-SAME: [[VF]]
# RELOC-NEXT: R_X86_64_PLT32 f3+0x8
.4byte f3@PLT - vtable

## For .rodata.f6, which is defined further down in this file. Its `jmp f3`
## relocation is listed with the [.rodata] records, after the vtable slots,
## so it is checked here rather than next to the instruction.
# B2B-RELOC-NEXT: R_X86_64_PLT32 f3-0x4

## _start issues one branch per case under test. The disassembly checks
## verify the resolved branch target and the relocation checks verify the
## symbol kept by --emit-relocs.
.section .text._start,"ax"
.globl _start
# CHECK: <_start>:
# RELOC: RELOCATION RECORDS FOR [.text]:
# RELOC-NEXT: OFFSET
_start:
## f1 tail calls f2, which tail calls f3, so the whole chain is followed.
# B2B-NEXT: jmp {{.*}} <f3>
# B2B-RELOC-NEXT: R_X86_64_PLT32 f3-0x4
# NOB2B-NEXT: jmp {{.*}} <f1{{.*}}>
# NOB2B-RELOC-NEXT: R_X86_64_PLT32 f1-0x4
jmp f1
## A single hop, from f2 to f3.
# B2B-NEXT: jmp {{.*}} <f3>
# B2B-RELOC-NEXT: R_X86_64_PLT32 f3-0x4
# NOB2B-NEXT: jmp {{.*}} <f2{{.*}}>
# NOB2B-RELOC-NEXT: R_X86_64_PLT32 f2-0x4
jmp f2
## This will assemble to a relocation pointing to an STT_SECTION for .text.f4
## with an addend, which looks similar to the relative vtable cases above but
## requires different handling of the addend so that we don't think this is
## branching to the `jmp f3` at the start of the target section.
# CHECK-NEXT: jmp {{.*}} <f4{{.*}}>
# RELOC-NEXT: R_X86_64_PLT32 .text+0x2e
jmp f4
## f5 is a gnu_indirect_function. The branch is expected to go to the IPLT
## entry (its address is captured here and matched against the .iplt
## disassembly at the end of the file) and the relocation stays against f5.
# B2B-NEXT: jmp 0x[[IPLT:[0-9a-f]*]]
# RELOC-NEXT: R_X86_64_PLT32 f5-0x4
jmp f5
## f6 is defined in .rodata.f6, a section without the executable flag. The
## branch and its relocation are expected to stay on f6.
# B2B-NEXT: jmp {{.*}} <f6>
# RELOC-NEXT: R_X86_64_PLT32 f6-0x4
jmp f6
## f7 is defined in .wtext.f7, a section that is both writable and
## executable. The branch and its relocation are expected to stay on f7.
# B2B-NEXT: jmp {{.*}} <f7>
# RELOC-NEXT: R_X86_64_PLT32 f7-0x4
jmp f7

## f1 consists solely of a tail call to f2. Since f2 in turn tail calls f3,
## the relocation here is expected to be retargeted to f3 when the
## optimization is enabled and to stay on f2 otherwise.
.section .text.f1,"ax"
.globl f1
f1:
# B2B-RELOC-NEXT: R_X86_64_PLT32 f3-0x4
# NOB2B-RELOC-NEXT: R_X86_64_PLT32 f2-0x4
jmp f2

## f2 consists solely of a tail call to f3. Its branch is expected to target
## f3 in every configuration, so only the common prefixes are used.
.section .text.f2,"ax"
.globl f2
# CHECK: <f2>:
f2:
# CHECK-NEXT: jmp {{.*}} <f3{{.*}}>
# RELOC-NEXT: R_X86_64_PLT32 f3-0x4
jmp f3

## f3 is the end of the f1 -> f2 -> f3 chain and branches to itself.
.section .text.f3,"ax"
.globl f3
f3:
## Test that a self-branch doesn't trigger an infinite loop.
# RELOC-NEXT: R_X86_64_PLT32 f3-0x4
jmp f3

## .text.f4 starts with a `jmp f3` that sits in front of the local label f4.
## A branch to f4 is a section-relative reference into this section and must
## not be confused with a branch to that leading jmp (see `jmp f4` in _start).
.section .text.f4,"ax"
## Relocation of the leading jmp. It follows f3's entry in the [.text]
## records.
# RELOC-NEXT: R_X86_64_PLT32 f3-0x4
jmp f3
f4:
ret

## f5 is a gnu_indirect_function whose body is a single branch to f3.
.section .text.f5,"ax"
.type f5, @gnu_indirect_function
.globl f5
f5:
# RELOC-NEXT: R_X86_64_PLT32 f3-0x4
jmp f3

## f6 is placed in a section without the executable flag. Its relocation is
## listed with the [.rodata] records, not with [.text], and is therefore
## checked next to the vtable entries earlier in this file.
.section .rodata.f6,"a"
.globl f6
f6:
jmp f3

# RELOC: RELOCATION RECORDS FOR [.wtext.f7]:
# RELOC-NEXT: OFFSET

## f7 is placed in a section that is both writable and executable. It gets
## its own relocation record table, checked above and below.
.section .wtext.f7,"awx"
.globl f7
f7:
# RELOC-NEXT: R_X86_64_PLT32 f3-0x4
jmp f3

## The address captured as IPLT at the `jmp f5` check in _start must be the
## address of the first entry in the .iplt disassembly.
# B2B: <.iplt>:
# B2B-NEXT: [[IPLT]]: