llvm-project/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp

//===- DbiStreamBuilder.cpp - PDB Dbi Stream Creation -----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The data structures defined in this file are based on the reference
// implementation which is available at
// https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.cpp
//
//===----------------------------------------------------------------------===//

#include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"

#include "llvm/ADT/DenseSet.h"
#include "llvm/DebugInfo/CodeView/RecordName.h"
#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
#include "llvm/DebugInfo/CodeView/SymbolSerializer.h"
#include "llvm/DebugInfo/MSF/MSFBuilder.h"
#include "llvm/DebugInfo/MSF/MSFCommon.h"
#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
#include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
#include "llvm/DebugInfo/PDB/Native/Hash.h"
#include "llvm/Support/BinaryItemStream.h"
#include "llvm/Support/BinaryStreamWriter.h"
#include "llvm/Support/Parallel.h"
#include "llvm/Support/xxhash.h"
#include <algorithm>
#include <vector>

using namespace llvm;
using namespace llvm::msf;
using namespace llvm::pdb;
using namespace llvm::codeview;

struct llvm::pdb::GSIHashStreamBuilder {
  struct SymbolDenseMapInfo {
    static inline CVSymbol getEmptyKey() {
      static CVSymbol Empty;
      return Empty;
    }
    static inline CVSymbol getTombstoneKey() {
      static CVSymbol Tombstone(
          DenseMapInfo<ArrayRef<uint8_t>>::getTombstoneKey());
      return Tombstone;
    }
    static unsigned getHashValue(const CVSymbol &Val) {
      return xxHash64(Val.RecordData);
    }
    static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) {
      return LHS.RecordData == RHS.RecordData;
    }
  };

  std::vector<CVSymbol> Records;
  llvm::DenseSet<CVSymbol, SymbolDenseMapInfo> SymbolHashes;
  std::vector<PSHashRecord> HashRecords;
  std::array<support::ulittle32_t, (IPHR_HASH + 32) / 32> HashBitmap;
  std::vector<support::ulittle32_t> HashBuckets;

  uint32_t calculateSerializedLength() const;
  uint32_t calculateRecordByteSize() const;
  Error commit(BinaryStreamWriter &Writer);

  void finalizeBuckets(uint32_t RecordZeroOffset);

  // Finalize public symbol buckets.
  void finalizeBuckets(uint32_t RecordZeroOffset,
                       std::vector<BulkPublic> &&Publics);

  template <typename T> void addSymbol(const T &Symbol, MSFBuilder &Msf) {
    T Copy(Symbol);
    addSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(),
                                               CodeViewContainer::Pdb));
  }
  void addSymbol(const CVSymbol &Symbol);
};

void GSIHashStreamBuilder::addSymbol(const codeview::CVSymbol &Symbol) {
  // Ignore duplicate typedefs and constants.
  if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) {
    auto Iter = SymbolHashes.insert(Symbol);
    if (!Iter.second)
      return;
  }
  Records.push_back(Symbol);
}

namespace {
LLVM_PACKED_START
struct PublicSym32Layout {
  RecordPrefix Prefix;
  PublicSym32Header Pub;
  // char Name[];
};
LLVM_PACKED_END
} // namespace

// Calculate how much memory this public needs when serialized.
static uint32_t sizeOfPublic(const BulkPublic &Pub) {
  uint32_t NameLen = Pub.NameLen;
  NameLen = std::min(NameLen,
                     uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
  return alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);
}

static CVSymbol serializePublic(uint8_t *Mem, const BulkPublic &Pub) {
  // Assume the caller has allocated sizeOfPublic bytes.
  uint32_t NameLen = std::min(
      Pub.NameLen, uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
  size_t Size = alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);
  assert(Size == sizeOfPublic(Pub));
  auto *FixedMem = reinterpret_cast<PublicSym32Layout *>(Mem);
  FixedMem->Prefix.RecordKind = static_cast<uint16_t>(codeview::S_PUB32);
  FixedMem->Prefix.RecordLen = static_cast<uint16_t>(Size - 2);
  FixedMem->Pub.Flags = Pub.Flags;
  FixedMem->Pub.Offset = Pub.Offset;
  FixedMem->Pub.Segment = Pub.U.Segment;
  char *NameMem = reinterpret_cast<char *>(FixedMem + 1);
  memcpy(NameMem, Pub.Name, NameLen);
  // Zero the null terminator and remaining bytes.
  memset(&NameMem[NameLen], 0, Size - sizeof(PublicSym32Layout) - NameLen);
  return CVSymbol(makeArrayRef(reinterpret_cast<uint8_t *>(Mem), Size));
}

uint32_t GSIHashStreamBuilder::calculateSerializedLength() const {
  uint32_t Size = sizeof(GSIHashHeader);
  Size += HashRecords.size() * sizeof(PSHashRecord);
  Size += HashBitmap.size() * sizeof(uint32_t);
  Size += HashBuckets.size() * sizeof(uint32_t);
  return Size;
}

uint32_t GSIHashStreamBuilder::calculateRecordByteSize() const {
  uint32_t Size = 0;
  for (const auto &Sym : Records)
    Size += Sym.length();
  return Size;
}

Error GSIHashStreamBuilder::commit(BinaryStreamWriter &Writer) {
  GSIHashHeader Header;
  Header.VerSignature = GSIHashHeader::HdrSignature;
  Header.VerHdr = GSIHashHeader::HdrVersion;
  Header.HrSize = HashRecords.size() * sizeof(PSHashRecord);
  Header.NumBuckets = HashBitmap.size() * 4 + HashBuckets.size() * 4;

  if (auto EC = Writer.writeObject(Header))
    return EC;

  if (auto EC = Writer.writeArray(makeArrayRef(HashRecords)))
    return EC;
  if (auto EC = Writer.writeArray(makeArrayRef(HashBitmap)))
    return EC;
  if (auto EC = Writer.writeArray(makeArrayRef(HashBuckets)))
    return EC;
  return Error::success();
}

static bool isAsciiString(StringRef S) {
  return llvm::all_of(S, [](char C) { return unsigned(C) < 0x80; });
}

// See `caseInsensitiveComparePchPchCchCch` in gsi.cpp
static int gsiRecordCmp(StringRef S1, StringRef S2) {
  size_t LS = S1.size();
  size_t RS = S2.size();
  // Shorter strings always compare less than longer strings.
  if (LS != RS)
    return LS - RS;

  // If either string contains non ascii characters, memcmp them.
  if (LLVM_UNLIKELY(!isAsciiString(S1) || !isAsciiString(S2)))
    return memcmp(S1.data(), S2.data(), LS);

  // Both strings are ascii, perform a case-insenstive comparison.
  return S1.compare_lower(S2.data());
}

void GSIHashStreamBuilder::finalizeBuckets(uint32_t RecordZeroOffset) {
  // Build up a list of globals to be bucketed. This repurposes the BulkPublic
  // struct with different meanings for the fields to avoid reallocating a new
  // vector during public symbol table hash construction.
  std::vector<BulkPublic> Globals;
  Globals.resize(Records.size());
  uint32_t SymOffset = RecordZeroOffset;
  for (size_t I = 0, E = Records.size(); I < E; ++I) {
    StringRef Name = getSymbolName(Records[I]);
    Globals[I].Name = Name.data();
    Globals[I].NameLen = Name.size();
    Globals[I].SymOffset = SymOffset;
    SymOffset += Records[I].length();
  }

  finalizeBuckets(RecordZeroOffset, std::move(Globals));
}

void GSIHashStreamBuilder::finalizeBuckets(uint32_t RecordZeroOffset,
                                           std::vector<BulkPublic> &&Globals) {
  // Hash every name in parallel. The Segment field is no longer needed, so
  // store the BucketIdx in a union.
  parallelForEachN(0, Globals.size(), [&](size_t I) {
    Globals[I].U.BucketIdx = hashStringV1(Globals[I].Name) % IPHR_HASH;
  });

  // Parallel sort by bucket index, then name within the buckets. Within the
  // buckets, sort each bucket by memcmp of the symbol's name.  It's important
  // that we use the same sorting algorithm as is used by the reference
  // implementation to ensure that the search for a record within a bucket can
  // properly early-out when it detects the record won't be found.  The
  // algorithm used here corredsponds to the function
  // caseInsensitiveComparePchPchCchCch in the reference implementation.
  auto BucketCmp = [](const BulkPublic &L, const BulkPublic &R) {
    if (L.U.BucketIdx != R.U.BucketIdx)
      return L.U.BucketIdx < R.U.BucketIdx;
    int Cmp = gsiRecordCmp(L.getName(), R.getName());
    if (Cmp != 0)
      return Cmp < 0;
    // This comparison is necessary to make the sorting stable in the presence
    // of two static globals with the same name. The easiest way to observe
    // this is with S_LDATA32 records.
    return L.SymOffset < R.SymOffset;
  };
  parallelSort(Globals, BucketCmp);

  // Zero out the bucket index bitmap.
  for (ulittle32_t &Word : HashBitmap)
    Word = 0;

  // Compute the three tables: the hash records in bucket and chain order, the
  // bucket presence bitmap, and the bucket chain start offsets.
  HashRecords.reserve(Globals.size());
  uint32_t LastBucketIdx = ~0U;
  for (const BulkPublic &Global : Globals) {
    // If this is a new bucket, add it to the bitmap and the start offset map.
    uint32_t BucketIdx = Global.U.BucketIdx;
    if (LastBucketIdx != BucketIdx) {
      HashBitmap[BucketIdx / 32] |= 1U << (BucketIdx % 32);

      // Calculate what the offset of the first hash record in the chain would
      // be if it were inflated to contain 32-bit pointers. On a 32-bit system,
      // each record would be 12 bytes. See HROffsetCalc in gsi.h.
      const int SizeOfHROffsetCalc = 12;
      ulittle32_t ChainStartOff =
          ulittle32_t(HashRecords.size() * SizeOfHROffsetCalc);
      HashBuckets.push_back(ChainStartOff);
      LastBucketIdx = BucketIdx;
    }

    // Create the hash record. Add one when writing symbol offsets to disk.
    // See GSI1::fixSymRecs. Always use a refcount of 1 for now.
    PSHashRecord HRec;
    HRec.Off = Global.SymOffset + 1;
    HRec.CRef = 1;
    HashRecords.push_back(HRec);
  }
}

GSIStreamBuilder::GSIStreamBuilder(msf::MSFBuilder &Msf)
    : Msf(Msf), PSH(std::make_unique<GSIHashStreamBuilder>()),
      GSH(std::make_unique<GSIHashStreamBuilder>()) {}

GSIStreamBuilder::~GSIStreamBuilder() {}

uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const {
  uint32_t Size = 0;
  Size += sizeof(PublicsStreamHeader);
  Size += PSH->calculateSerializedLength();
  Size += PubAddrMap.size() * sizeof(uint32_t); // AddrMap
  // FIXME: Add thunk map and section offsets for incremental linking.

  return Size;
}

uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const {
  return GSH->calculateSerializedLength();
}

Error GSIStreamBuilder::finalizeMsfLayout() {
  // First we write public symbol records, then we write global symbol records.
  uint32_t PublicsSize = PSH->calculateRecordByteSize();
  uint32_t GlobalsSize = GSH->calculateRecordByteSize();
  GSH->finalizeBuckets(PublicsSize);

  Expected<uint32_t> Idx = Msf.addStream(calculateGlobalsHashStreamSize());
  if (!Idx)
    return Idx.takeError();
  GlobalsStreamIndex = *Idx;

  Idx = Msf.addStream(calculatePublicsHashStreamSize());
  if (!Idx)
    return Idx.takeError();
  PublicsStreamIndex = *Idx;

  uint32_t RecordBytes = PublicsSize + GlobalsSize;

  Idx = Msf.addStream(RecordBytes);
  if (!Idx)
    return Idx.takeError();
  RecordStreamIndex = *Idx;
  return Error::success();
}

void GSIStreamBuilder::addPublicSymbols(std::vector<BulkPublic> &&Publics) {
  // Sort the symbols by name. PDBs contain lots of symbols, so use parallelism.
  parallelSort(Publics, [](const BulkPublic &L, const BulkPublic &R) {
    return L.getName() < R.getName();
  });

  // Assign offsets and allocate one contiguous block of memory for all public
  // symbols.
  uint32_t SymOffset = 0;
  for (BulkPublic &Pub : Publics) {
    Pub.SymOffset = SymOffset;
    SymOffset += sizeOfPublic(Pub);
  }
  uint8_t *Mem =
      reinterpret_cast<uint8_t *>(Msf.getAllocator().Allocate(SymOffset, 4));

  // Instead of storing individual CVSymbol records, store them as one giant
  // buffer.
  // FIXME: This is kind of a hack. This makes Records.size() wrong, and we have
  // to account for that elsewhere.
  PSH->Records.push_back(CVSymbol(makeArrayRef(Mem, SymOffset)));

  // Serialize them in parallel.
  parallelForEachN(0, Publics.size(), [&](size_t I) {
    const BulkPublic &Pub = Publics[I];
    serializePublic(Mem + Pub.SymOffset, Pub);
  });

  // Re-sort the publics by address so we can build the address map. We no
  // longer need the original ordering.
  auto AddrCmp = [](const BulkPublic &L, const BulkPublic &R) {
    if (L.U.Segment != R.U.Segment)
      return L.U.Segment < R.U.Segment;
    if (L.Offset != R.Offset)
      return L.Offset < R.Offset;
    // parallelSort is unstable, so we have to do name comparison to ensure
    // that two names for the same location come out in a determinstic order.
    return L.getName() < R.getName();
  };
  parallelSort(Publics, AddrCmp);

  // Fill in the symbol offsets in the appropriate order.
  PubAddrMap.reserve(Publics.size());
  for (const BulkPublic &Pub : Publics)
    PubAddrMap.push_back(ulittle32_t(Pub.SymOffset));

  // Finalize public symbol buckets immediately after they have been added.
  // They should all be warm in the cache at this point, so go ahead and do it
  // now.
  PSH->finalizeBuckets(0, std::move(Publics));
}

void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) {
  GSH->addSymbol(Sym, Msf);
}

void GSIStreamBuilder::addGlobalSymbol(const DataSym &Sym) {
  GSH->addSymbol(Sym, Msf);
}

void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) {
  GSH->addSymbol(Sym, Msf);
}

void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Sym) {
  GSH->addSymbol(Sym);
}

static Error writeRecords(BinaryStreamWriter &Writer,
                          ArrayRef<CVSymbol> Records) {
  BinaryItemStream<CVSymbol> ItemStream(support::endianness::little);
  ItemStream.setItems(Records);
  BinaryStreamRef RecordsRef(ItemStream);
  return Writer.writeStreamRef(RecordsRef);
}

Error GSIStreamBuilder::commitSymbolRecordStream(
    WritableBinaryStreamRef Stream) {
  BinaryStreamWriter Writer(Stream);

  // Write public symbol records first, followed by global symbol records.  This
  // must match the order that we assume in finalizeMsfLayout when computing
  // PSHZero and GSHZero.
  if (auto EC = writeRecords(Writer, PSH->Records))
    return EC;
  if (auto EC = writeRecords(Writer, GSH->Records))
    return EC;

  return Error::success();
}

Error GSIStreamBuilder::commitPublicsHashStream(
    WritableBinaryStreamRef Stream) {
  BinaryStreamWriter Writer(Stream);
  PublicsStreamHeader Header;

  // FIXME: Fill these in. They are for incremental linking.
  Header.SymHash = PSH->calculateSerializedLength();
  Header.AddrMap = PubAddrMap.size() * 4;
  Header.NumThunks = 0;
  Header.SizeOfThunk = 0;
  Header.ISectThunkTable = 0;
  memset(Header.Padding, 0, sizeof(Header.Padding));
  Header.OffThunkTable = 0;
  Header.NumSections = 0;
  if (auto EC = Writer.writeObject(Header))
    return EC;

  if (auto EC = PSH->commit(Writer))
    return EC;

  if (auto EC = Writer.writeArray(makeArrayRef(PubAddrMap)))
    return EC;

  return Error::success();
}

Error GSIStreamBuilder::commitGlobalsHashStream(
    WritableBinaryStreamRef Stream) {
  BinaryStreamWriter Writer(Stream);
  return GSH->commit(Writer);
}

Error GSIStreamBuilder::commit(const msf::MSFLayout &Layout,
                               WritableBinaryStreamRef Buffer) {
  auto GS = WritableMappedBlockStream::createIndexedStream(
      Layout, Buffer, getGlobalsStreamIndex(), Msf.getAllocator());
  auto PS = WritableMappedBlockStream::createIndexedStream(
      Layout, Buffer, getPublicsStreamIndex(), Msf.getAllocator());
  auto PRS = WritableMappedBlockStream::createIndexedStream(
      Layout, Buffer, getRecordStreamIndex(), Msf.getAllocator());

  if (auto EC = commitSymbolRecordStream(*PRS))
    return EC;
  if (auto EC = commitGlobalsHashStream(*GS))
    return EC;
  if (auto EC = commitPublicsHashStream(*PS))
    return EC;
  return Error::success();
}