Fangrui Song a1532ed275 [InstrProfiling] Make CountersPtr in __profd_ relative
Change `CountersPtr` in `__profd_` to a label difference, which is a link-time
constant. On ELF, when linking a shared object, this requires that `__profc_` is
either private or linkonce/linkonce_odr hidden. On COFF, we need D104564 so that
`.quad a-b` (64-bit label difference) can lower to a 32-bit PC-relative relocation.

```
# ELF: R_X86_64_PC64 (PC-relative)
.quad .L__profc_foo-.L__profd_foo

# Mach-O: a pair of 8-byte X86_64_RELOC_UNSIGNED and X86_64_RELOC_SUBTRACTOR
.quad l___profc_foo-l___profd_foo

# COFF: we actually use IMAGE_REL_AMD64_REL32/IMAGE_REL_ARM64_REL32 so
# the high 32-bit value is zero even if .L__profc_foo < .L__profd_foo
# As compensation, we truncate CountersDelta in the header so that
# __llvm_profile_merge_from_buffer and llvm-profdata reader keep working.
.quad .L__profc_foo-.L__profd_foo
```

(Note: link.exe sorts `.lprfc` before `.lprfd` even if the object writer
has `.lprfd` before `.lprfc`, so we cannot work around by reordering
`.lprfc` and `.lprfd`.)

With this change, a stage 2 (`-DLLVM_TARGETS_TO_BUILD=X86 -DLLVM_BUILD_INSTRUMENTED=IR`)
`ld -pie` linked clang is 1.74% smaller due to fewer R_X86_64_RELATIVE relocations.
```
% readelf -r pie | awk '$3~/R.*/{s[$3]++} END {for (k in s) print k, s[k]}'
R_X86_64_JUMP_SLO 331
R_X86_64_TPOFF64 2
R_X86_64_RELATIVE 476059  # was: 607712
R_X86_64_64 2616
R_X86_64_GLOB_DAT 31
```

The absolute function address (used by llvm-profdata to collect indirect call
targets) can be converted to relative as well, but is not done in this patch.

Differential Revision: https://reviews.llvm.org/D104556
2021-07-30 11:52:18 -07:00

36 lines
1.2 KiB
LLVM

; RUN: opt < %s -pgo-instr-gen -instrprof -S | FileCheck %s
; RUN: opt <%s -passes=pgo-instr-gen,instrprof -S | FileCheck %s
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
define void @foo(i8* %dst, i8* %src, i32* %a, i32 %n) {
entry:
br label %for.cond
for.cond:
%i.0 = phi i32 [ 0, %entry ], [ %add, %for.cond1 ]
%cmp = icmp slt i32 %i.0, %n
br i1 %cmp, label %for.cond1, label %for.end6
for.cond1:
%j.0 = phi i32 [ %inc, %for.body3 ], [ 0, %for.cond ]
%idx.ext = sext i32 %i.0 to i64
%add.ptr = getelementptr inbounds i32, i32* %a, i64 %idx.ext
%0 = load i32, i32* %add.ptr, align 4
%cmp2 = icmp slt i32 %j.0, %0
%add = add nsw i32 %i.0, 1
br i1 %cmp2, label %for.body3, label %for.cond
for.body3:
%conv = sext i32 %add to i64
; CHECK: call void @__llvm_profile_instrument_memop(i64 %conv, i8* bitcast ({ i64, i64, i64, i8*, i8*, i32, [2 x i16] }* @__profd_foo to i8*), i32 0)
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %conv, i1 false)
%inc = add nsw i32 %j.0, 1
br label %for.cond1
for.end6:
ret void
}
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1)