[ELF] Separate relative and non-relative dynamic relocations (#187959)

Previously, the flow was:

1. Parallel scan adds relative relocs to per-thread `relocsVec`
2. `mergeRels()` copies all into `relocs`
3. `partitionRels()` uses `stable_partition` to separate

Now, relative relocs are routed at `addReloc` time by checking
`combreloc && reloc.type == relativeRel`. In `mergeRels`, sharded entries
are classified through the same `addReloc` path rather than blindly
appended, because `relocsVec` may contain non-relative entries like
`R_AARCH64_AUTH_RELATIVE`.

This eliminates the `stable_partition` on the full relocation vector
(543K entries for clang) and avoids copying relative relocations into
`relocs` only to move them out again.

Linking an x86_64 release+assertions build of clang is 1.04x as fast as
before this change.

`numRelativeRelocs` caches `relativeRelocs.size()` at `finalizeContents`
time for `DT_RELACOUNT`. Using a live `relativeRelocs.size()` would
cause `DynamicSection::writeTo` to emit an extra entry when thunks add
relocs after `.dynamic` is sized, overflowing into adjacent sections.
Tested by ppc64-long-branch-rel14.s.
This commit is contained in:
Fangrui Song 2026-03-22 18:46:20 -07:00 committed by GitHub
parent 5567572c44
commit 076226f378
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 35 additions and 31 deletions

View File

@ -1483,7 +1483,8 @@ RelocationBaseSection::RelocationBaseSection(Ctx &ctx, StringRef name,
unsigned concurrency)
: SyntheticSection(ctx, name, type, SHF_ALLOC, ctx.arg.wordsize),
dynamicTag(dynamicTag), sizeDynamicTag(sizeDynamicTag),
relocsVec(concurrency), combreloc(combreloc) {}
relocsVec(concurrency), relativeRel(ctx.target->relativeRel),
combreloc(combreloc) {}
void RelocationBaseSection::addSymbolReloc(
RelType dynType, InputSectionBase &isec, uint64_t offsetInSec, Symbol &sym,
@ -1503,29 +1504,24 @@ void RelocationBaseSection::addAddendOnlyRelocIfNonPreemptible(
}
void RelocationBaseSection::mergeRels() {
size_t newSize = relocs.size();
size_t newSize = relativeRelocs.size();
for (const auto &v : relocsVec)
newSize += v.size();
relocs.reserve(newSize);
relativeRelocs.reserve(newSize);
// Classify relocsVec entries into relativeRelocs or relocs. Note that
// relocsVec may contain non-relative entries (e.g. R_AARCH64_AUTH_RELATIVE)
// so we must check the type.
for (const auto &v : relocsVec)
llvm::append_range(relocs, v);
for (const DynamicReloc &r : v)
addReloc(r);
relocsVec.clear();
}
void RelocationBaseSection::partitionRels() {
if (!combreloc)
return;
const RelType relativeRel = ctx.target->relativeRel;
numRelativeRelocs =
std::stable_partition(relocs.begin(), relocs.end(),
[=](auto &r) { return r.type == relativeRel; }) -
relocs.begin();
}
void RelocationBaseSection::finalizeContents() {
mergeRels();
// Compute DT_RELACOUNT to be used by part.dynamic.
partitionRels();
// Cache the count for DT_RELACOUNT. This must not change after
// DynamicSection::finalizeContents sizes the .dynamic section.
numRelativeRelocs = relativeRelocs.size();
SymbolTableBaseSection *symTab = getPartition(ctx).dynSymTab.get();
// When linking glibc statically, .rel{,a}.plt contains R_*_IRELATIVE
@ -1551,26 +1547,29 @@ void DynamicReloc::finalize(Ctx &ctx, SymbolTableBaseSection *symt) {
void RelocationBaseSection::computeRels() {
SymbolTableBaseSection *symTab = getPartition(ctx).dynSymTab.get();
parallelForEach(relativeRelocs, [&ctx = ctx, symTab](DynamicReloc &rel) {
rel.finalize(ctx, symTab);
});
parallelForEach(relocs, [&ctx = ctx, symTab](DynamicReloc &rel) {
rel.finalize(ctx, symTab);
});
// Place IRELATIVE relocations last so that other dynamic relocations are
// applied before IFUNC resolvers run.
auto irelative = std::stable_partition(
relocs.begin() + numRelativeRelocs, relocs.end(),
relocs.begin(), relocs.end(),
[t = ctx.target->iRelativeRel](auto &r) { return r.type != t; });
// Sort by (!IsRelative,SymIndex,r_offset). DT_REL[A]COUNT requires us to
// place R_*_RELATIVE first. SymIndex is to improve locality, while r_offset
// is to make results easier to read.
if (combreloc) {
auto nonRelative = relocs.begin() + numRelativeRelocs;
parallelSort(relocs.begin(), nonRelative,
[&](auto &a, auto &b) { return a.r_offset < b.r_offset; });
// Non-relative relocations are few, so don't bother with parallelSort.
llvm::sort(nonRelative, irelative, [&](auto &a, auto &b) {
parallelSort(relativeRelocs.begin(), relativeRelocs.end(),
[](auto &a, auto &b) { return a.r_offset < b.r_offset; });
// Non-relative relocations are few, so don't bother with parallelSort.
if (combreloc)
llvm::sort(relocs.begin(), irelative, [](auto &a, auto &b) {
return std::tie(a.r_sym, a.r_offset) < std::tie(b.r_sym, b.r_offset);
});
}
}
template <class ELFT>
@ -1585,7 +1584,9 @@ RelocationSection<ELFT>::RelocationSection(Ctx &ctx, StringRef name,
template <class ELFT> void RelocationSection<ELFT>::writeTo(uint8_t *buf) {
computeRels();
for (const DynamicReloc &rel : relocs) {
// Write relative relocations first for DT_REL[A]COUNT.
for (const DynamicReloc &rel :
llvm::concat<const DynamicReloc>(relativeRelocs, relocs)) {
auto *p = reinterpret_cast<Elf_Rela *>(buf);
p->r_offset = rel.r_offset;
p->setSymbolAndType(rel.r_sym, rel.type, ctx.arg.isMips64EL);

View File

@ -495,7 +495,10 @@ public:
/// This overload can be used if the addends are written directly instead of
/// using relocations on the input section (e.g. MipsGotSection::writeTo()).
template <bool shard = false> void addReloc(const DynamicReloc &reloc) {
relocs.push_back(reloc);
if (combreloc && reloc.type == relativeRel)
relativeRelocs.push_back(reloc);
else
relocs.push_back(reloc);
}
/// Add a dynamic relocation against \p sym with an optional addend.
void addSymbolReloc(RelType dynType, InputSectionBase &isec,
@ -532,11 +535,11 @@ public:
{dynType, &sec, offsetInSec, isAgainstSymbol, sym, addend, expr});
}
bool isNeeded() const override {
return !relocs.empty() ||
return !relocs.empty() || !relativeRelocs.empty() ||
llvm::any_of(relocsVec, [](auto &v) { return !v.empty(); });
}
size_t getSize() const override {
size_t count = relocs.size();
size_t count = relocs.size() + relativeRelocs.size();
for (const auto &v : relocsVec)
count += v.size();
return count * this->entsize;
@ -545,16 +548,16 @@ public:
void finalizeContents() override;
int32_t dynamicTag, sizeDynamicTag;
SmallVector<DynamicReloc, 0> relocs;
SmallVector<DynamicReloc, 0> relocs, relativeRelocs;
protected:
void mergeRels();
void partitionRels();
void computeRels();
// Used when parallel relocation scanning adds relocations. The elements
// will be moved into relocs by mergeRel().
// will be classified into relativeRelocs or relocs by mergeRels().
SmallVector<SmallVector<DynamicReloc, 0>, 0> relocsVec;
size_t numRelativeRelocs = 0; // used by -z combreloc
RelType relativeRel;
bool combreloc;
};