[lld-macho] Parallelize linker optimization hint processing
This commit moves the parsing of linker optimization hints into
`ARM64::applyOptimizationHints`. This lets us avoid allocating memory for
holding the parsed information, and moves work out of `ObjFile::parse`, which
is not parallelized at the moment.

This change reduces the overhead of processing LOHs to 25-30 ms when linking
Chromium Framework on my M1 machine; previously it took close to 100 ms. There
is no statistically significant change in runtime for a --threads=1 link.

Performance figures with all 8 cores utilized:

    N           Min           Max        Median           Avg        Stddev
x  20     3.8027232     3.8760762     3.8505335     3.8454145   0.026352574
+  20     3.7019017     3.8660538     3.7546209     3.7620371   0.032680043
Difference at 95.0% confidence
        -0.0833775 +/- 0.019
        -2.16823% +/- 0.494094%
        (Student's t, pooled s = 0.0296854)

Differential Revision: https://reviews.llvm.org/D133439
parent 396ed327bb
commit a8843ec952
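The LOH payload that this change decodes on the fly is a flat byte stream: each
entry is a ULEB128-encoded transformation kind, an argument count, and that many
ULEB128-encoded addresses, with a kind of 0 terminating the list (this is the
format described by the diff's own comments and implemented by the new
`forEachHint` helper below). The following is a minimal, self-contained sketch of
walking such a stream; it is not lld code, and the sample bytes and the kind
value 7 are purely illustrative.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Decode one unsigned LEB128 value and advance `p`; this mirrors what
// llvm::decodeULEB128 does for well-formed input.
static uint64_t readULEB128(const uint8_t *&p, const uint8_t *end) {
  uint64_t value = 0;
  unsigned shift = 0;
  while (p < end) {
    uint8_t byte = *p++;
    value |= uint64_t(byte & 0x7f) << shift;
    if ((byte & 0x80) == 0)
      break;
    shift += 7;
  }
  return value;
}

int main() {
  // Hypothetical stream: one hint of kind 7 with two addresses, then the
  // 0 terminator. Real kind values come from llvm::MachO::LOH_ARM64_*.
  std::vector<uint8_t> data = {7, 2, 0x10, 0x14, 0};
  for (const uint8_t *p = data.data(), *end = p + data.size(); p < end;) {
    uint64_t kind = readULEB128(p, end);
    if (kind == 0)
      break;
    uint64_t argCount = readULEB128(p, end);
    std::printf("hint kind %llu:", (unsigned long long)kind);
    for (uint64_t i = 0; i < argCount; ++i)
      std::printf(" 0x%llx", (unsigned long long)readULEB128(p, end));
    std::printf("\n");
  }
}
```

Because each hint takes only a handful of ULEB128 reads, re-decoding the stream
per file inside `ARM64::applyOptimizationHints` is cheap, which is what lets the
commit drop the pre-parsed `OptimizationHint` vectors and the work in
`ObjFile::parse`.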
@@ -18,6 +18,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/BinaryFormat/MachO.h"
 #include "llvm/Support/Endian.h"
+#include "llvm/Support/LEB128.h"
 #include "llvm/Support/MathExtras.h"

 using namespace llvm;
@@ -40,8 +41,7 @@ struct ARM64 : ARM64Common {
                             uint64_t selectorIndex, uint64_t gotAddr,
                             uint64_t msgSendIndex) const override;
   void populateThunk(InputSection *thunk, Symbol *funcSym) override;
-  void applyOptimizationHints(uint8_t *,
-                              const ConcatInputSection *) const override;
+  void applyOptimizationHints(uint8_t *, const ObjFile &) const override;
 };

 } // namespace
@@ -196,23 +196,6 @@ struct Ldr {
   ExtendType extendType;
   int64_t offset;
 };
-
-class OptimizationHintContext {
-public:
-  OptimizationHintContext(uint8_t *buf, const ConcatInputSection *isec)
-      : buf(buf), isec(isec) {}
-
-  void applyAdrpAdd(uint64_t, uint64_t);
-  void applyAdrpAdrp(uint64_t, uint64_t);
-  void applyAdrpLdr(uint64_t, uint64_t);
-  void applyAdrpLdrGot(uint64_t, uint64_t);
-  void applyAdrpAddLdr(uint64_t, uint64_t, uint64_t);
-  void applyAdrpLdrGotLdr(uint64_t, uint64_t, uint64_t);
-
-private:
-  uint8_t *buf;
-  const ConcatInputSection *isec;
-};
 } // namespace

 static bool parseAdrp(uint32_t insn, Adrp &adrp) {
@@ -347,7 +330,8 @@ static void writeImmediateLdr(void *loc, const Ldr &ldr) {
 // ->
 // adr xM, _foo
 // nop
-void OptimizationHintContext::applyAdrpAdd(uint64_t offset1, uint64_t offset2) {
+static void applyAdrpAdd(uint8_t *buf, const ConcatInputSection *isec,
+                         uint64_t offset1, uint64_t offset2) {
   uint32_t ins1 = read32le(buf + offset1);
   uint32_t ins2 = read32le(buf + offset2);
   Adrp adrp;
@@ -375,8 +359,8 @@ void OptimizationHintContext::applyAdrpAdd(uint64_t offset1, uint64_t offset2) {
 // ->
 // adrp xN, _foo@PAGE
 // nop
-void OptimizationHintContext::applyAdrpAdrp(uint64_t offset1,
-                                            uint64_t offset2) {
+static void applyAdrpAdrp(uint8_t *buf, const ConcatInputSection *isec,
+                          uint64_t offset1, uint64_t offset2) {
   uint32_t ins1 = read32le(buf + offset1);
   uint32_t ins2 = read32le(buf + offset2);
   Adrp adrp1, adrp2;
@@ -402,7 +386,8 @@ void OptimizationHintContext::applyAdrpAdrp(uint64_t offset1,
 // ->
 // nop
 // ldr xM, _foo
-void OptimizationHintContext::applyAdrpLdr(uint64_t offset1, uint64_t offset2) {
+static void applyAdrpLdr(uint8_t *buf, const ConcatInputSection *isec,
+                         uint64_t offset1, uint64_t offset2) {
   uint32_t ins1 = read32le(buf + offset1);
   uint32_t ins2 = read32le(buf + offset2);
   Adrp adrp;
@@ -426,15 +411,15 @@ void OptimizationHintContext::applyAdrpLdr(uint64_t offset1, uint64_t offset2) {
 // GOT loads are emitted by the compiler as a pair of adrp and ldr instructions,
 // but they may be changed to adrp+add by relaxGotLoad(). This hint performs
 // the AdrpLdr or AdrpAdd transformation depending on whether it was relaxed.
-void OptimizationHintContext::applyAdrpLdrGot(uint64_t offset1,
-                                              uint64_t offset2) {
+static void applyAdrpLdrGot(uint8_t *buf, const ConcatInputSection *isec,
+                            uint64_t offset1, uint64_t offset2) {
   uint32_t ins2 = read32le(buf + offset2);
   Add add;
   Ldr ldr;
   if (parseAdd(ins2, add))
-    applyAdrpAdd(offset1, offset2);
+    applyAdrpAdd(buf, isec, offset1, offset2);
   else if (parseLdr(ins2, ldr))
-    applyAdrpLdr(offset1, offset2);
+    applyAdrpLdr(buf, isec, offset1, offset2);
 }

 // Optimizes an adrp+add+ldr sequence used for loading from a local symbol's
@@ -444,9 +429,9 @@ void OptimizationHintContext::applyAdrpLdrGot(uint64_t offset1,
 // adrp x0, _foo@PAGE
 // add x1, x0, _foo@PAGEOFF
 // ldr x2, [x1, #off]
-void OptimizationHintContext::applyAdrpAddLdr(uint64_t offset1,
-                                              uint64_t offset2,
-                                              uint64_t offset3) {
+static void applyAdrpAddLdr(uint8_t *buf, const ConcatInputSection *isec,
+                            uint64_t offset1, uint64_t offset2,
+                            uint64_t offset3) {
   uint32_t ins1 = read32le(buf + offset1);
   Adrp adrp;
   if (!parseAdrp(ins1, adrp))
@@ -512,15 +497,15 @@ void OptimizationHintContext::applyAdrpAddLdr(uint64_t offset1,
 // the GOT entry can be loaded with a single literal ldr instruction.
 // If the referenced symbol is local and thus has been relaxed to adrp+add+ldr,
 // we perform the AdrpAddLdr transformation.
-void OptimizationHintContext::applyAdrpLdrGotLdr(uint64_t offset1,
-                                                 uint64_t offset2,
-                                                 uint64_t offset3) {
+static void applyAdrpLdrGotLdr(uint8_t *buf, const ConcatInputSection *isec,
+                               uint64_t offset1, uint64_t offset2,
+                               uint64_t offset3) {
   uint32_t ins2 = read32le(buf + offset2);
   Add add;
   Ldr ldr2;

   if (parseAdd(ins2, add)) {
-    applyAdrpAddLdr(offset1, offset2, offset3);
+    applyAdrpAddLdr(buf, isec, offset1, offset2, offset3);
   } else if (parseLdr(ins2, ldr2)) {
     // adrp x1, _foo@GOTPAGE
     // ldr x2, [x1, _foo@GOTPAGEOFF]
@@ -559,47 +544,167 @@ void OptimizationHintContext::applyAdrpLdrGotLdr(uint64_t offset1,
   }
 }

-void ARM64::applyOptimizationHints(uint8_t *buf,
-                                   const ConcatInputSection *isec) const {
-  assert(isec);
+static uint64_t readValue(const uint8_t *&ptr, const uint8_t *end) {
+  unsigned int n = 0;
+  uint64_t value = decodeULEB128(ptr, &n, end);
+  ptr += n;
+  return value;
+}

 // Note: Some of these optimizations might not be valid when shared regions
 // are in use. Will need to revisit this if splitSegInfo is added.
+template <typename Callback>
+static void forEachHint(ArrayRef<uint8_t> data, Callback callback) {
+  std::array<uint64_t, 3> args;

-  OptimizationHintContext ctx(buf, isec);
-  for (const OptimizationHint &hint : isec->optimizationHints) {
-    switch (hint.type) {
-    case LOH_ARM64_ADRP_ADRP:
-      // This is done in another pass because the other optimization hints
-      // might cause its targets to be turned into NOPs.
+  for (const uint8_t *p = data.begin(), *end = data.end(); p < end;) {
+    uint64_t type = readValue(p, end);
+    if (type == 0)
      break;

+    uint64_t argCount = readValue(p, end);
+    // All known LOH types as of 2022-09 have 3 or fewer arguments; skip others.
+    if (argCount > 3) {
+      for (unsigned i = 0; i < argCount; ++i)
+        readValue(p, end);
+      continue;
+    }
+
+    for (unsigned i = 0; i < argCount; ++i)
+      args[i] = readValue(p, end);
+    callback(type, ArrayRef<uint64_t>(args.data(), argCount));
+  }
+}
+
+// On RISC architectures like arm64, materializing a memory address generally
+// takes multiple instructions. If the referenced symbol is located close enough
+// in memory, fewer instructions are needed.
+//
+// Linker optimization hints record where addresses are computed. After
+// addresses have been assigned, if possible, we change them to a shorter
+// sequence of instructions. The size of the binary is not modified; the
+// eliminated instructions are replaced with NOPs. This still leads to faster
+// code as the CPU can skip over NOPs quickly.
+//
+// LOHs are specified by the LC_LINKER_OPTIMIZATION_HINTS load command, which
+// points to a sequence of ULEB128-encoded numbers. Each entry specifies a
+// transformation kind, and 2 or 3 addresses where the instructions are located.
+void ARM64::applyOptimizationHints(uint8_t *outBuf, const ObjFile &obj) const {
+  ArrayRef<uint8_t> data = obj.getOptimizationHints();
+  if (data.empty())
+    return;
+
+  const ConcatInputSection *section = nullptr;
+  uint64_t sectionAddr = 0;
+  uint8_t *buf = nullptr;
+
+  auto findSection = [&](uint64_t addr) {
+    if (section && addr >= sectionAddr &&
+        addr < sectionAddr + section->getSize())
+      return true;
+
+    auto secIt = std::prev(llvm::upper_bound(
+        obj.sections, addr,
+        [](uint64_t off, const Section *sec) { return off < sec->addr; }));
+    const Section *sec = *secIt;
+
+    auto subsecIt = std::prev(llvm::upper_bound(
+        sec->subsections, addr - sec->addr,
+        [](uint64_t off, Subsection subsec) { return off < subsec.offset; }));
+    const Subsection &subsec = *subsecIt;
+    const ConcatInputSection *isec =
+        dyn_cast_or_null<ConcatInputSection>(subsec.isec);
+    if (!isec || isec->shouldOmitFromOutput())
+      return false;
+
+    section = isec;
+    sectionAddr = subsec.offset + sec->addr;
+    buf = outBuf + section->outSecOff + section->parent->fileOff;
+    return true;
+  };
+
+  auto isValidOffset = [&](uint64_t offset) {
+    if (offset < sectionAddr || offset >= sectionAddr + section->getSize()) {
+      error("linker optimization hint spans multiple sections");
+      return false;
+    }
+    return true;
+  };
+
+  bool hasAdrpAdrp = false;
+  forEachHint(data, [&](uint64_t kind, ArrayRef<uint64_t> args) {
+    if (kind == LOH_ARM64_ADRP_ADRP) {
+      hasAdrpAdrp = true;
+      return;
+    }
+
+    if (!findSection(args[0]))
+      return;
+    switch (kind) {
+    case LOH_ARM64_ADRP_ADD:
+      if (isValidOffset(args[1]))
+        applyAdrpAdd(buf, section, args[0] - sectionAddr,
+                     args[1] - sectionAddr);
       break;
     case LOH_ARM64_ADRP_LDR:
-      ctx.applyAdrpLdr(hint.offset0, hint.offset0 + hint.delta[0]);
+      if (isValidOffset(args[1]))
+        applyAdrpLdr(buf, section, args[0] - sectionAddr,
+                     args[1] - sectionAddr);
       break;
+    case LOH_ARM64_ADRP_LDR_GOT:
+      if (isValidOffset(args[1]))
+        applyAdrpLdrGot(buf, section, args[0] - sectionAddr,
+                        args[1] - sectionAddr);
+      break;
     case LOH_ARM64_ADRP_ADD_LDR:
-      ctx.applyAdrpAddLdr(hint.offset0, hint.offset0 + hint.delta[0],
-                          hint.offset0 + hint.delta[1]);
+      if (isValidOffset(args[1]) && isValidOffset(args[2]))
+        applyAdrpAddLdr(buf, section, args[0] - sectionAddr,
+                        args[1] - sectionAddr, args[2] - sectionAddr);
       break;
     case LOH_ARM64_ADRP_LDR_GOT_LDR:
-      ctx.applyAdrpLdrGotLdr(hint.offset0, hint.offset0 + hint.delta[0],
-                             hint.offset0 + hint.delta[1]);
+      if (isValidOffset(args[1]) && isValidOffset(args[2]))
+        applyAdrpLdrGotLdr(buf, section, args[0] - sectionAddr,
+                           args[1] - sectionAddr, args[2] - sectionAddr);
       break;
     case LOH_ARM64_ADRP_ADD_STR:
     case LOH_ARM64_ADRP_LDR_GOT_STR:
       // TODO: Implement these
       break;
-    case LOH_ARM64_ADRP_ADD:
-      ctx.applyAdrpAdd(hint.offset0, hint.offset0 + hint.delta[0]);
-      break;
-    case LOH_ARM64_ADRP_LDR_GOT:
-      ctx.applyAdrpLdrGot(hint.offset0, hint.offset0 + hint.delta[0]);
-      break;
     }
-  }
+  });

-  for (const OptimizationHint &hint : isec->optimizationHints)
-    if (hint.type == LOH_ARM64_ADRP_ADRP)
-      ctx.applyAdrpAdrp(hint.offset0, hint.offset0 + hint.delta[0]);
+  if (!hasAdrpAdrp)
+    return;
+
+  // AdrpAdrp optimization hints are performed in a second pass because they
+  // might interfere with other transformations. For instance, consider the
+  // following input:
+  //
+  //   adrp x0, _foo@PAGE
+  //   add x1, x0, _foo@PAGEOFF
+  //   adrp x0, _bar@PAGE
+  //   add x2, x0, _bar@PAGEOFF
+  //
+  // If we perform the AdrpAdrp relaxation first, we get:
+  //
+  //   adrp x0, _foo@PAGE
+  //   add x1, x0, _foo@PAGEOFF
+  //   nop
+  //   add x2, x0, _bar@PAGEOFF
+  //
+  // If we then apply AdrpAdd to the first two instructions, the add will have a
+  // garbage value in x0:
+  //
+  //   adr x1, _foo
+  //   nop
+  //   nop
+  //   add x2, x0, _bar@PAGEOFF
+  forEachHint(data, [&](uint64_t kind, ArrayRef<uint64_t> args) {
+    if (kind != LOH_ARM64_ADRP_ADRP)
+      return;
+    if (!findSection(args[0]))
+      return;
+    if (isValidOffset(args[1]))
+      applyAdrpAdrp(buf, section, args[0] - sectionAddr, args[1] - sectionAddr);
+  });
 }

 TargetInfo *macho::createARM64TargetInfo() {
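
The `findSection` lambda above maps a hint's address to its containing section
and subsection with `std::prev(llvm::upper_bound(...))`. Here is a small
standalone sketch of that lookup pattern (plain STL rather than lld code, with
made-up start offsets): `upper_bound` returns the first start offset greater
than the address, so its predecessor is the entry whose range contains the
address, assuming the address is not below the first start.

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Given subsection start offsets sorted in ascending order, return the index
// of the subsection containing `addr`.
static size_t containingIndex(const std::vector<uint64_t> &starts,
                              uint64_t addr) {
  auto it = std::prev(std::upper_bound(starts.begin(), starts.end(), addr));
  return static_cast<size_t>(it - starts.begin());
}

int main() {
  std::vector<uint64_t> starts = {0, 0x40, 0x100}; // hypothetical offsets
  assert(containingIndex(starts, 0x3f) == 0);
  assert(containingIndex(starts, 0x40) == 1);
  assert(containingIndex(starts, 0xff) == 1);
  assert(containingIndex(starts, 0x100) == 2);
  return 0;
}
```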
@@ -463,155 +463,6 @@ static Defined *findSymbolAtOffset(const ConcatInputSection *isec,
   return *it;
 }

-// Linker optimization hints mark a sequence of instructions used for
-// synthesizing an address which that be transformed into a faster sequence. The
-// transformations depend on conditions that are determined at link time, like
-// the distance to the referenced symbol or its alignment.
-//
-// Each hint has a type and refers to 2 or 3 instructions. Each of those
-// instructions must have a corresponding relocation. After addresses have been
-// finalized and relocations have been performed, we check if the requirements
-// hold, and perform the optimizations if they do.
-//
-// Similar linker relaxations exist for ELF as well, with the difference being
-// that the explicit marking allows for the relaxation of non-consecutive
-// relocations too.
-//
-// The specific types of hints are documented in Arch/ARM64.cpp
-void ObjFile::parseOptimizationHints(ArrayRef<uint8_t> data) {
-  auto expectedArgCount = [](uint8_t type) {
-    switch (type) {
-    case LOH_ARM64_ADRP_ADRP:
-    case LOH_ARM64_ADRP_LDR:
-    case LOH_ARM64_ADRP_ADD:
-    case LOH_ARM64_ADRP_LDR_GOT:
-      return 2;
-    case LOH_ARM64_ADRP_ADD_LDR:
-    case LOH_ARM64_ADRP_ADD_STR:
-    case LOH_ARM64_ADRP_LDR_GOT_LDR:
-    case LOH_ARM64_ADRP_LDR_GOT_STR:
-      return 3;
-    }
-    return -1;
-  };
-
-  // Each hint contains at least 4 ULEB128-encoded fields, so in the worst case,
-  // there are data.size() / 4 LOHs. It's a huge overestimation though, as
-  // offsets are unlikely to fall in the 0-127 byte range, so we pre-allocate
-  // half as much.
-  optimizationHints.reserve(data.size() / 8);
-
-  for (const uint8_t *p = data.begin(); p < data.end();) {
-    const ptrdiff_t inputOffset = p - data.begin();
-    unsigned int n = 0;
-    uint8_t type = decodeULEB128(p, &n, data.end());
-    p += n;
-
-    // An entry of type 0 terminates the list.
-    if (type == 0)
-      break;
-
-    int expectedCount = expectedArgCount(type);
-    if (LLVM_UNLIKELY(expectedCount == -1)) {
-      error("Linker optimization hint at offset " + Twine(inputOffset) +
-            " has unknown type " + Twine(type));
-      return;
-    }
-
-    uint8_t argCount = decodeULEB128(p, &n, data.end());
-    p += n;
-
-    if (LLVM_UNLIKELY(argCount != expectedCount)) {
-      error("Linker optimization hint at offset " + Twine(inputOffset) +
-            " has " + Twine(argCount) + " arguments instead of the expected " +
-            Twine(expectedCount));
-      return;
-    }
-
-    uint64_t offset0 = decodeULEB128(p, &n, data.end());
-    p += n;
-
-    int16_t delta[2];
-    for (int i = 0; i < argCount - 1; ++i) {
-      uint64_t address = decodeULEB128(p, &n, data.end());
-      p += n;
-      int64_t d = address - offset0;
-      if (LLVM_UNLIKELY(d > std::numeric_limits<int16_t>::max() ||
-                        d < std::numeric_limits<int16_t>::min())) {
-        error("Linker optimization hint at offset " + Twine(inputOffset) +
-              " has addresses too far apart");
-        return;
-      }
-      delta[i] = d;
-    }
-
-    optimizationHints.push_back({offset0, {delta[0], delta[1]}, type});
-  }
-
-  // We sort the per-object vector of optimization hints so each section only
-  // needs to hold an ArrayRef to a contiguous range of hints.
-  llvm::sort(optimizationHints,
-             [](const OptimizationHint &a, const OptimizationHint &b) {
-               return a.offset0 < b.offset0;
-             });
-
-  auto section = sections.begin();
-  auto subsection = (*section)->subsections.begin();
-  uint64_t subsectionBase = 0;
-  uint64_t subsectionEnd = 0;
-
-  auto updateAddr = [&]() {
-    subsectionBase = (*section)->addr + subsection->offset;
-    subsectionEnd = subsectionBase + subsection->isec->getSize();
-  };
-
-  auto advanceSubsection = [&]() {
-    if (section == sections.end())
-      return;
-    ++subsection;
-    while (subsection == (*section)->subsections.end()) {
-      ++section;
-      if (section == sections.end())
-        return;
-      subsection = (*section)->subsections.begin();
-    }
-  };
-
-  updateAddr();
-  auto hintStart = optimizationHints.begin();
-  for (auto hintEnd = hintStart, end = optimizationHints.end(); hintEnd != end;
-       ++hintEnd) {
-    if (hintEnd->offset0 >= subsectionEnd) {
-      subsection->isec->optimizationHints =
-          ArrayRef<OptimizationHint>(&*hintStart, hintEnd - hintStart);
-
-      hintStart = hintEnd;
-      while (hintStart->offset0 >= subsectionEnd) {
-        advanceSubsection();
-        if (section == sections.end())
-          break;
-        updateAddr();
-        assert(hintStart->offset0 >= subsectionBase);
-      }
-    }
-
-    hintEnd->offset0 -= subsectionBase;
-    for (int i = 0, count = expectedArgCount(hintEnd->type); i < count - 1;
-         ++i) {
-      if (LLVM_UNLIKELY(
-              hintEnd->delta[i] < -static_cast<int64_t>(hintEnd->offset0) ||
-              hintEnd->delta[i] >=
-                  static_cast<int64_t>(subsectionEnd - hintEnd->offset0))) {
-        error("Linker optimization hint spans multiple sections");
-        return;
-      }
-    }
-  }
-  if (section != sections.end())
-    subsection->isec->optimizationHints = ArrayRef<OptimizationHint>(
-        &*hintStart, optimizationHints.end() - hintStart);
-}
-
 template <class SectionHeader>
 static bool validateRelocationInfo(InputFile *file, const SectionHeader &sec,
                                    relocation_info rel) {
@@ -1129,11 +980,6 @@ template <class LP> void ObjFile::parse() {
     if (!sections[i]->subsections.empty())
       parseRelocations(sectionHeaders, sectionHeaders[i], *sections[i]);

-  if (!config->ignoreOptimizationHints)
-    if (auto *cmd = findCommand<linkedit_data_command>(
-            hdr, LC_LINKER_OPTIMIZATION_HINT))
-      parseOptimizationHints({buf + cmd->dataoff, cmd->datasize});
-
   parseDebugInfo();

   Section *ehFrameSection = nullptr;
@@ -1213,6 +1059,14 @@ ArrayRef<data_in_code_entry> ObjFile::getDataInCode() const {
           c->datasize / sizeof(data_in_code_entry)};
 }

+ArrayRef<uint8_t> ObjFile::getOptimizationHints() const {
+  const auto *buf = reinterpret_cast<const uint8_t *>(mb.getBufferStart());
+  if (auto *cmd =
+          findCommand<linkedit_data_command>(buf, LC_LINKER_OPTIMIZATION_HINT))
+    return {buf + cmd->dataoff, cmd->datasize};
+  return {};
+}
+
 // Create pointers from symbols to their associated compact unwind entries.
 void ObjFile::registerCompactUnwind(Section &compactUnwindSection) {
   for (const Subsection &subsection : compactUnwindSection.subsections) {
@@ -159,6 +159,7 @@
   ObjFile(MemoryBufferRef mb, uint32_t modTime, StringRef archiveName,
           bool lazy = false, bool forceHidden = false);
   ArrayRef<llvm::MachO::data_in_code_entry> getDataInCode() const;
+  ArrayRef<uint8_t> getOptimizationHints() const;
   template <class LP> void parse();

   static bool classof(const InputFile *f) { return f->kind() == ObjKind; }
@@ -176,7 +177,6 @@ public:
   std::vector<ConcatInputSection *> debugSections;
   std::vector<CallGraphEntry> callGraph;
   llvm::DenseMap<ConcatInputSection *, FDE> fdes;
-  std::vector<OptimizationHint> optimizationHints;
   std::vector<AliasSymbol *> aliases;

 private:
@@ -193,7 +193,6 @@ private:
   void parseRelocations(ArrayRef<SectionHeader> sectionHeaders,
                         const SectionHeader &, Section &);
   void parseDebugInfo();
-  void parseOptimizationHints(ArrayRef<uint8_t> data);
   void splitEhFrames(ArrayRef<uint8_t> dataArr, Section &ehFrameSection);
   void registerCompactUnwind(Section &compactUnwindSection);
   void registerEhFrames(Section &ehFrameSection);
@@ -29,8 +29,8 @@ using namespace lld::macho;
 // Verify ConcatInputSection's size on 64-bit builds. The size of std::vector
 // can differ based on STL debug levels (e.g. iterator debugging on MSVC's STL),
 // so account for that.
-static_assert(sizeof(void *) != 8 || sizeof(ConcatInputSection) ==
-                                         sizeof(std::vector<Reloc>) + 104,
+static_assert(sizeof(void *) != 8 ||
+                  sizeof(ConcatInputSection) == sizeof(std::vector<Reloc>) + 88,
               "Try to minimize ConcatInputSection's size, we create many "
               "instances of it");

@@ -219,8 +219,6 @@ void ConcatInputSection::writeTo(uint8_t *buf) {
     }
     target->relocateOne(loc, r, referentVA, getVA() + r.offset);
   }
-
-  target->applyOptimizationHints(buf, this);
 }

 ConcatInputSection *macho::makeSyntheticInputSection(StringRef segName,
@@ -83,7 +83,6 @@ public:
   OutputSection *parent = nullptr;
   ArrayRef<uint8_t> data;
   std::vector<Reloc> relocs;
-  ArrayRef<OptimizationHint> optimizationHints;
   // The symbols that belong to this InputSection, sorted by value. With
   // .subsections_via_symbols, there is typically only one element here.
   llvm::TinyPtrVector<Defined *> symbols;
@@ -69,14 +69,6 @@ struct Reloc {
         addend(addend), referent(referent) {}
 };

-struct OptimizationHint {
-  // Offset of the first address within the containing InputSection.
-  uint64_t offset0;
-  // Offset of the other addresses relative to the first one.
-  int16_t delta[2];
-  uint8_t type;
-};
-
 bool validateSymbolRelocation(const Symbol *, const InputSection *,
                               const Reloc &);

@@ -27,7 +27,7 @@ class Symbol;
 class Defined;
 class DylibSymbol;
 class InputSection;
-class ConcatInputSection;
+class ObjFile;

 class TargetInfo {
 public:
@@ -97,8 +97,7 @@ public:
     llvm_unreachable("Unsupported architecture for dtrace symbols");
   }

-  virtual void applyOptimizationHints(uint8_t *buf,
-                                      const ConcatInputSection *) const {};
+  virtual void applyOptimizationHints(uint8_t *, const ObjFile &) const {};

   uint32_t magic;
   llvm::MachO::CPUType cpuType;
@@ -60,6 +60,7 @@ public:

   void openFile();
   void writeSections();
+  void applyOptimizationHints();
   void writeUuid();
   void writeCodeSignature();
   void writeOutputFile();
@@ -1072,6 +1073,18 @@ void Writer::writeSections() {
   });
 }

+void Writer::applyOptimizationHints() {
+  if (config->arch() != AK_arm64 || config->ignoreOptimizationHints)
+    return;
+
+  uint8_t *buf = buffer->getBufferStart();
+  TimeTraceScope timeScope("Apply linker optimization hints");
+  parallelForEach(inputFiles, [buf](const InputFile *file) {
+    if (const auto *objFile = dyn_cast<ObjFile>(file))
+      target->applyOptimizationHints(buf, *objFile);
+  });
+}
+
 // In order to utilize multiple cores, we first split the buffer into chunks,
 // compute a hash for each chunk, and then compute a hash value of the hash
 // values.
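
The hunk above is where the parallelism comes from: every object file becomes one
task, and each task only writes into the output-buffer ranges covered by that
file's own sections, so the writes are disjoint and no locking is needed. A rough
sketch of this dispatch pattern with `llvm::parallelForEach` follows; `FileJob`
and `applyHintsForFile` are placeholders for illustration, not lld APIs.

```cpp
#include "llvm/Support/Parallel.h"
#include <cstdint>
#include <vector>

struct FileJob {
  uint8_t *buf; // this file's slice of the output buffer
  // ... per-file hint data would live here ...
};

// Decode and apply one file's hints into its slice of the buffer.
static void applyHintsForFile(const FileJob &job) { (void)job; }

void applyAllHints(std::vector<FileJob> &files) {
  // Each iteration touches only its own FileJob, so running the iterations in
  // parallel is safe without synchronization.
  llvm::parallelForEach(files, applyHintsForFile);
}
```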
@@ -1114,6 +1127,7 @@ void Writer::writeOutputFile() {
   if (errorCount())
     return;
   writeSections();
+  applyOptimizationHints();
   writeUuid();
   writeCodeSignature();

@@ -1,15 +1,10 @@
 # REQUIRES: aarch64

-# RUN: rm -rf %t; split-file %s %t
-# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/section.s -o %t/section.o
-# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/far.s -o %t/far.o
-# RUN: not %lld -arch arm64 %t/section.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=SECTION
-# RUN: not %lld -arch arm64 %t/far.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=FAR
+# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %s -o %t.o
+# RUN: not %lld -arch arm64 %t.o -o /dev/null 2>&1 | FileCheck %s

-# SECTION: error: Linker optimization hint spans multiple sections
-# FAR: error: Linker optimization hint at offset 0 has addresses too far apart
+# CHECK: error: linker optimization hint spans multiple sections

-#--- section.s
 .globl _main
 _main:
 L1:
@@ -23,17 +18,3 @@ _target:

 .loh AdrpAdd L1, L2
 .subsections_via_symbols
-
-#--- far.s
-.globl _main
-_main:
-L1:
-adrp x0, _target@PAGE
-.zero 0x8000
-L2:
-add x0, x0, _target@PAGEOFF
-
-_target:
-
-.loh AdrpAdd L1, L2
-.subsections_via_symbols
@@ -17,6 +17,11 @@
 ## Not an adrp instruction (invalid)
 # CHECK-NEXT: nop
 # CHECK-NEXT: adrp x4
+## Other relaxations take precedence over AdrpAdrp
+# CHECK-NEXT: adr x6
+# CHECK-NEXT: nop
+# CHECK-NEXT: adr x6
+# CHECK-NEXT: nop

 .text
 .align 2
@@ -39,6 +44,14 @@ L7:
 nop
 L8:
 adrp x4, _baz@PAGE
+L9:
+adrp x5, _foo@PAGE
+L10:
+add x6, x5, _foo@PAGEOFF
+L11:
+adrp x5, _bar@PAGE
+L12:
+add x6, x5, _bar@PAGEOFF

 .data
 .align 12
@@ -54,3 +67,6 @@ _baz:
 .loh AdrpAdrp L3, L4
 .loh AdrpAdrp L5, L6
 .loh AdrpAdrp L7, L8
+.loh AdrpAdrp L9, L11
+.loh AdrpAdd L9, L10
+.loh AdrpAdd L11, L12