Change `CountersPtr` in `__profd_` to a label difference, which is a link-time
constant. On ELF, when linking a shared object, this requires that `__profc_` is
either private or linkonce/linkonce_odr hidden. On COFF, we need D104564 so that
`.quad a-b` (64-bit label difference) can lower to a 32-bit PC-relative relocation.
```
# ELF: R_X86_64_PC64 (PC-relative)
.quad .L__profc_foo-.L__profd_foo
# Mach-O: a pair of 8-byte X86_64_RELOC_UNSIGNED and X86_64_RELOC_SUBTRACTOR
.quad l___profc_foo-l___profd_foo
# COFF: we actually use IMAGE_REL_AMD64_REL32/IMAGE_REL_ARM64_REL32, so
# the high 32 bits of the value are zero even if .L__profc_foo < .L__profd_foo.
# To compensate, we truncate CountersDelta in the header so that
# __llvm_profile_merge_from_buffer and the llvm-profdata reader keep working.
.quad .L__profc_foo-.L__profd_foo
```
(Note: link.exe sorts `.lprfc` before `.lprfd` even if the object writer
has `.lprfd` before `.lprfc`, so we cannot work around this by reordering
`.lprfc` and `.lprfd`.)
With this change, a stage 2 (`-DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_BUILD_INSTRUMENTED=IR`)
`ld -pie` linked clang is 1.74% smaller due to fewer R_X86_64_RELATIVE relocations.
```
% readelf -r pie | awk '$3~/R.*/{s[$3]++} END {for (k in s) print k, s[k]}'
R_X86_64_JUMP_SLO 331
R_X86_64_TPOFF64 2
R_X86_64_RELATIVE 476059 # was: 607712
R_X86_64_64 2616
R_X86_64_GLOB_DAT 31
```
The absolute function address (used by llvm-profdata to collect indirect call
targets) can be converted to relative as well, but is not done in this patch.
Differential Revision: https://reviews.llvm.org/D104556
36 lines
1.2 KiB
LLVM
36 lines
1.2 KiB
LLVM
; Test driver: the two lit command lines below feed this file to `opt` twice,
; once with the individual -pgo-instr-gen / -instrprof pass flags and once with
; the -passes=pgo-instr-gen,instrprof pipeline syntax. Each invocation prints
; textual IR (-S) and pipes it to FileCheck, so both must produce output that
; satisfies the same expectation in this file.
; RUN: opt < %s -pgo-instr-gen -instrprof -S | FileCheck %s
|
|
; RUN: opt <%s -passes=pgo-instr-gen,instrprof -S | FileCheck %s
|
|
; The module is pinned to little-endian ELF x86-64 Linux.
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
|
|
target triple = "x86_64-unknown-linux-gnu"
|
|
|
|
; @foo is the instrumentation input: a two-level loop nest whose innermost
; block calls llvm.memcpy with a run-time (non-constant) length.
;
;   %dst, %src : memcpy destination / source pointers (passed through as-is)
;   %a         : i32 array; a[i] is the trip-count bound of the inner loop
;   %n         : trip-count bound of the outer loop
;
; The FileCheck expectation inside for.body3 requires that the instrumented
; output contains a call to @__llvm_profile_instrument_memop that receives the
; memcpy length (%conv) and the address of @__profd_foo.
define void @foo(i8* %dst, i8* %src, i32* %a, i32 %n) {
|
|
entry:
|
|
br label %for.cond
|
|
|
|
; Outer loop header: %i.0 starts at 0 and is replaced by %add (i + 1) each time
; control comes back from for.cond1.
for.cond:
|
|
%i.0 = phi i32 [ 0, %entry ], [ %add, %for.cond1 ]
|
|
%cmp = icmp slt i32 %i.0, %n
|
|
; Leave the nest once %i.0 >= %n (signed compare).
br i1 %cmp, label %for.cond1, label %for.end6
|
|
|
|
; Inner loop header: %j.0 starts at 0 on entry from for.cond and is replaced by
; %inc (j + 1) on the back edge from for.body3.
for.cond1:
|
|
%j.0 = phi i32 [ %inc, %for.body3 ], [ 0, %for.cond ]
|
|
%idx.ext = sext i32 %i.0 to i64
|
|
%add.ptr = getelementptr inbounds i32, i32* %a, i64 %idx.ext
|
|
; %0 = a[i], reloaded on every inner iteration; it bounds %j.0.
%0 = load i32, i32* %add.ptr, align 4
|
|
%cmp2 = icmp slt i32 %j.0, %0
|
|
; %add = i + 1 is computed here, in the inner header, because it is used both
; as the next outer index (phi in for.cond) and as the memcpy length below.
%add = add nsw i32 %i.0, 1
|
|
; Stay in the inner loop while j < a[i]; otherwise return to the outer header.
br i1 %cmp2, label %for.body3, label %for.cond
|
|
|
|
; Inner loop body: the memcpy whose size operand is expected to be profiled.
for.body3:
|
|
; Widen i + 1 to i64; this is the variable length passed to memcpy.
%conv = sext i32 %add to i64
|
|
; Expected instrumentation: a memop value-profiling call taking the length
; %conv, @__profd_foo cast to i8*, and the constant i32 0 as last argument.
; NOTE(review): the literal struct type pins the __profd_ layout; presumably
; its third i64 field is the relative CountersPtr that the commit message
; describes - confirm against the profile data record definition.
; CHECK: call void @__llvm_profile_instrument_memop(i64 %conv, i8* bitcast ({ i64, i64, i64, i8*, i8*, i32, [2 x i16] }* @__profd_foo to i8*), i32 0)
|
|
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %conv, i1 false)
|
|
%inc = add nsw i32 %j.0, 1
|
|
br label %for.cond1
|
|
|
|
; Exit block of the loop nest.
for.end6:
|
|
ret void
|
|
}
|
|
|
|
; Declaration of the memcpy intrinsic called from @foo: destination pointer,
; source pointer, i64 byte length, and a trailing i1 flag (passed as false at
; the call site; per the LLVM LangRef this is the intrinsic's volatile flag).
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1)
|