Reapply "[MemProf] Change histogram storage from uint64_t to uint16_t… (#151431)

Reapply #147854 after fixes merged in #151398.

Change memory access histogram storage from uint64_t to uint16_t to
reduce profile size on disk. This change updates the raw profile format
to v5. Also add a histogram test in compiler-rt since we didn't have one
before. With this change the histogram memprof raw for the basic test
reduces from 75KB -> 20KB.
This commit is contained in:
Snehasish Kumar 2025-07-30 18:28:53 -07:00 committed by GitHub
parent 70471f08ee
commit 08e40c12fa
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
29 changed files with 259 additions and 32 deletions

View File

@ -33,11 +33,10 @@
(uint64_t)'o' << 24 | (uint64_t)'f' << 16 | (uint64_t)'r' << 8 | (uint64_t)129)
// The version number of the raw binary format.
#define MEMPROF_RAW_VERSION 4ULL
#define MEMPROF_RAW_VERSION 5ULL
// Currently supported versions.
#define MEMPROF_RAW_SUPPORTED_VERSIONS \
{ 3ULL, 4ULL }
#define MEMPROF_RAW_SUPPORTED_VERSIONS {3ULL, 4ULL, 5ULL}
// NOTE(review): presumably the byte size of one serialized V3 MemInfoBlock --
// confirm against the V3 reader. Defined without a trailing semicolon so the
// macro can be used inside larger expressions and argument lists.
#define MEMPROF_V3_MIB_SIZE 132ULL
@ -229,6 +228,41 @@ void Merge(const MemInfoBlock &newMIB) {
} __attribute__((__packed__));
#endif
// Layout of the 16-bit scaled integer used for histogram counts: the upper
// ExponentBits hold a shift amount and the lower MantissaBits hold the value
// that gets shifted.
constexpr int MantissaBits = 12;
constexpr int ExponentBits = 4;
constexpr uint16_t MaxMantissa = (1U << MantissaBits) - 1;
constexpr uint16_t MaxExponent = (1U << ExponentBits) - 1;
// Largest count the format can express; bigger inputs saturate to this value.
constexpr uint64_t MaxRepresentableValue = static_cast<uint64_t>(MaxMantissa)
                                           << MaxExponent;

// Packs a 64-bit count into the 16-bit scaled integer format.
//
// Counts up to MaxMantissa are stored exactly. Larger counts are first clamped
// to MaxRepresentableValue and then halved, rounding up at every step, until
// they fit in the mantissa field; the number of halvings becomes the exponent.
// Because each step rounds up, the decoded value is never smaller than the
// (clamped) input.
inline uint16_t encodeHistogramCount(uint64_t Count) {
  // Small counts, zero included, need no scaling: exponent 0, mantissa = Count.
  if (Count <= MaxMantissa)
    return static_cast<uint16_t>(Count);

  // Saturate inputs that lie beyond the representable range.
  uint64_t Mantissa =
      Count > MaxRepresentableValue ? MaxRepresentableValue : Count;

  // Shrink the value into the mantissa field, tracking the shift applied.
  uint16_t Exponent = 0;
  for (; Mantissa > MaxMantissa; ++Exponent)
    Mantissa = (Mantissa + 1) >> 1;

  return static_cast<uint16_t>((Exponent << MantissaBits) | Mantissa);
}

// Expands a 16-bit scaled integer produced by encodeHistogramCount back into
// a 64-bit count by shifting the mantissa left by the stored exponent.
inline uint64_t decodeHistogramCount(uint16_t EncodedValue) {
  const uint64_t Mantissa = EncodedValue & MaxMantissa;
  const unsigned Shift = EncodedValue >> MantissaBits;
  return Mantissa << Shift;
}
} // namespace memprof
} // namespace llvm

View File

@ -19,6 +19,7 @@ using ::__sanitizer::Vector;
using ::llvm::memprof::MemInfoBlock;
using SegmentEntry = ::llvm::memprof::SegmentEntry;
using Header = ::llvm::memprof::Header;
using ::llvm::memprof::encodeHistogramCount;
namespace {
template <class T> char *WriteBytes(const T &Pod, char *Buffer) {
@ -169,13 +170,15 @@ void SerializeMIBInfoToBuffer(MIBMapTy &MIBMap, const Vector<u64> &StackIds,
// FIXME: We unnecessarily serialize the AccessHistogram pointer. Adding a
// serialization schema will fix this issue. See also FIXME in
// deserialization.
Ptr = WriteBytes((*h)->mib, Ptr);
for (u64 j = 0; j < (*h)->mib.AccessHistogramSize; ++j) {
u64 HistogramEntry = ((u64 *)((*h)->mib.AccessHistogram))[j];
auto &MIB = (*h)->mib;
Ptr = WriteBytes(MIB, Ptr);
for (u64 j = 0; j < MIB.AccessHistogramSize; ++j) {
u16 HistogramEntry =
encodeHistogramCount(((u64 *)(MIB.AccessHistogram))[j]);
Ptr = WriteBytes(HistogramEntry, Ptr);
}
if ((*h)->mib.AccessHistogramSize > 0) {
InternalFree((void *)((*h)->mib.AccessHistogram));
if (MIB.AccessHistogramSize > 0) {
InternalFree((void *)MIB.AccessHistogram);
}
}
CHECK(ExpectedNumBytes >= static_cast<u64>(Ptr - Buffer) &&
@ -249,7 +252,7 @@ u64 SerializeToRawProfile(MIBMapTy &MIBMap, ArrayRef<LoadedModule> Modules,
},
reinterpret_cast<void *>(&TotalAccessHistogramEntries));
const u64 NumHistogramBytes =
RoundUpTo(TotalAccessHistogramEntries * sizeof(uint64_t), 8);
RoundUpTo(TotalAccessHistogramEntries * sizeof(uint16_t), 8);
const u64 NumStackBytes = RoundUpTo(StackSizeBytes(StackIds), 8);

View File

@ -26,6 +26,7 @@ set(MEMPROF_SOURCES
../memprof_rawprofile.cpp)
# Source files listed as MemProf unit tests.
# NOTE(review): presumably all of these are compiled into one unit test
# binary -- confirm against the rest of this CMake file.
set(MEMPROF_UNITTESTS
histogram_encoding.cpp
rawprofile.cpp
driver.cpp)

View File

@ -0,0 +1,35 @@
#include <cstdint>
#include <vector>
#include "profile/MemProfData.inc"
#include "gtest/gtest.h"
namespace llvm {
namespace memprof {
namespace {
// Round-trips representative counts through encodeHistogramCount and
// decodeHistogramCount. Values that fit in the mantissa must come back
// exactly, values at or above the representable maximum must saturate, and
// everything in between must come back within a small relative error.
TEST(MemProf, F16EncodeDecode) {
  const std::vector<uint64_t> TestCases = {
      0, 100, 4095, 4096, 5000, 8191, 65535, 1000000, 134213640, 200000000,
  };

  for (const uint64_t TestCase : TestCases) {
    const uint16_t Encoded = encodeHistogramCount(TestCase);
    const uint64_t Decoded = decodeHistogramCount(Encoded);

    // Every expectation streams the input so a failure inside the loop
    // identifies which test case broke.
    if (TestCase >= MaxRepresentableValue) {
      // Out-of-range inputs saturate to the shared limit from the header.
      EXPECT_EQ(Decoded, MaxRepresentableValue) << "TestCase=" << TestCase;
    } else if (TestCase <= MaxMantissa) {
      // Small values are stored without scaling and must be exact.
      EXPECT_EQ(Decoded, TestCase) << "TestCase=" << TestCase;
    } else {
      // The decoded value should be close to the original value.
      // The error should be less than 1/1024 for larger numbers.
      EXPECT_NEAR(Decoded, TestCase, static_cast<double>(TestCase) / 1024.0)
          << "TestCase=" << TestCase;
    }
  }
}
} // namespace
} // namespace memprof
} // namespace llvm

View File

@ -33,11 +33,10 @@
(uint64_t)'o' << 24 | (uint64_t)'f' << 16 | (uint64_t)'r' << 8 | (uint64_t)129)
// The version number of the raw binary format.
#define MEMPROF_RAW_VERSION 4ULL
#define MEMPROF_RAW_VERSION 5ULL
// Currently supported versions.
#define MEMPROF_RAW_SUPPORTED_VERSIONS \
{ 3ULL, 4ULL }
#define MEMPROF_RAW_SUPPORTED_VERSIONS {3ULL, 4ULL, 5ULL}
// NOTE(review): presumably the byte size of one serialized V3 MemInfoBlock --
// confirm against the V3 reader. Defined without a trailing semicolon so the
// macro can be used inside larger expressions and argument lists.
#define MEMPROF_V3_MIB_SIZE 132ULL
@ -229,6 +228,41 @@ void Merge(const MemInfoBlock &newMIB) {
} __attribute__((__packed__));
#endif
// Layout of the 16-bit scaled integer used for histogram counts: the upper
// ExponentBits hold a shift amount and the lower MantissaBits hold the value
// that gets shifted.
constexpr int MantissaBits = 12;
constexpr int ExponentBits = 4;
constexpr uint16_t MaxMantissa = (1U << MantissaBits) - 1;
constexpr uint16_t MaxExponent = (1U << ExponentBits) - 1;
// Largest count the format can express; bigger inputs saturate to this value.
constexpr uint64_t MaxRepresentableValue = static_cast<uint64_t>(MaxMantissa)
                                           << MaxExponent;

// Packs a 64-bit count into the 16-bit scaled integer format.
//
// Counts up to MaxMantissa are stored exactly. Larger counts are first clamped
// to MaxRepresentableValue and then halved, rounding up at every step, until
// they fit in the mantissa field; the number of halvings becomes the exponent.
// Because each step rounds up, the decoded value is never smaller than the
// (clamped) input.
inline uint16_t encodeHistogramCount(uint64_t Count) {
  // Small counts, zero included, need no scaling: exponent 0, mantissa = Count.
  if (Count <= MaxMantissa)
    return static_cast<uint16_t>(Count);

  // Saturate inputs that lie beyond the representable range.
  uint64_t Mantissa =
      Count > MaxRepresentableValue ? MaxRepresentableValue : Count;

  // Shrink the value into the mantissa field, tracking the shift applied.
  uint16_t Exponent = 0;
  for (; Mantissa > MaxMantissa; ++Exponent)
    Mantissa = (Mantissa + 1) >> 1;

  return static_cast<uint16_t>((Exponent << MantissaBits) | Mantissa);
}

// Expands a 16-bit scaled integer produced by encodeHistogramCount back into
// a 64-bit count by shifting the mantissa left by the stored exponent.
inline uint64_t decodeHistogramCount(uint16_t EncodedValue) {
  const uint64_t Mantissa = EncodedValue & MaxMantissa;
  const unsigned Shift = EncodedValue >> MantissaBits;
  return Mantissa << Shift;
}
} // namespace memprof
} // namespace llvm

View File

@ -135,7 +135,7 @@ readMemInfoBlocksV3(const char *Ptr) {
}
llvm::SmallVector<std::pair<uint64_t, MemInfoBlock>>
readMemInfoBlocksV4(const char *Ptr) {
readMemInfoBlocksCommon(const char *Ptr, bool IsHistogramEncoded = false) {
using namespace support;
const uint64_t NumItemsToRead =
@ -145,27 +145,43 @@ readMemInfoBlocksV4(const char *Ptr) {
for (uint64_t I = 0; I < NumItemsToRead; I++) {
const uint64_t Id =
endian::readNext<uint64_t, llvm::endianness::little, unaligned>(Ptr);
// We cheat a bit here and remove the const from cast to set the
// Histogram Pointer to newly allocated buffer.
MemInfoBlock MIB = *reinterpret_cast<const MemInfoBlock *>(Ptr);
// Only increment by size of MIB since readNext implicitly increments.
MemInfoBlock MIB = *reinterpret_cast<const MemInfoBlock *>(Ptr);
Ptr += sizeof(MemInfoBlock);
if (MIB.AccessHistogramSize > 0) {
// The in-memory representation uses uint64_t for histogram entries.
MIB.AccessHistogram =
(uintptr_t)malloc(MIB.AccessHistogramSize * sizeof(uint64_t));
}
for (uint64_t J = 0; J < MIB.AccessHistogramSize; J++) {
((uint64_t *)MIB.AccessHistogram)[J] =
endian::readNext<uint64_t, llvm::endianness::little, unaligned>(Ptr);
for (uint64_t J = 0; J < MIB.AccessHistogramSize; J++) {
if (!IsHistogramEncoded) {
((uint64_t *)MIB.AccessHistogram)[J] =
endian::readNext<uint64_t, llvm::endianness::little, unaligned>(
Ptr);
} else {
// The encoded on-disk format (V5 onwards) uses uint16_t.
const uint16_t Val =
endian::readNext<uint16_t, llvm::endianness::little, unaligned>(
Ptr);
((uint64_t *)MIB.AccessHistogram)[J] = decodeHistogramCount(Val);
}
}
}
Items.push_back({Id, MIB});
}
return Items;
}
// Reads the MemInfoBlock entries starting at Ptr for a version 4 raw profile.
// Delegates to readMemInfoBlocksCommon with IsHistogramEncoded left at its
// default of false, so each histogram entry is read as a plain uint64_t.
llvm::SmallVector<std::pair<uint64_t, MemInfoBlock>>
readMemInfoBlocksV4(const char *Ptr) {
return readMemInfoBlocksCommon(Ptr);
}
// Reads the MemInfoBlock entries starting at Ptr for a version 5 raw profile.
// Delegates to readMemInfoBlocksCommon with IsHistogramEncoded set, so each
// histogram entry is read as a uint16_t and expanded with
// decodeHistogramCount into the uint64_t in-memory representation.
llvm::SmallVector<std::pair<uint64_t, MemInfoBlock>>
readMemInfoBlocksV5(const char *Ptr) {
return readMemInfoBlocksCommon(Ptr, /*IsHistogramEncoded=*/true);
}
CallStackMap readStackInfo(const char *Ptr) {
using namespace support;
@ -658,6 +674,8 @@ RawMemProfReader::readMemInfoBlocks(const char *Ptr) {
return readMemInfoBlocksV3(Ptr);
if (MemprofRawVersion == 4ULL)
return readMemInfoBlocksV4(Ptr);
if (MemprofRawVersion == 5ULL)
return readMemInfoBlocksV5(Ptr);
llvm_unreachable(
"Panic: Unsupported version number when reading MemInfoBlocks");
}

Binary file not shown.

View File

@ -7,7 +7,7 @@ We expect 5 MIBs, each with different AccessHistogramValues.
CHECK: MemprofProfile:
CHECK-NEXT: Summary:
CHECK-NEXT: Version: 4
CHECK-NEXT: Version: 5
CHECK-NEXT: NumSegments: {{[0-9]+}}
CHECK-NEXT: NumMibInfo: 5
CHECK-NEXT: NumAllocFunctions: 3
@ -241,4 +241,4 @@ CHECK-NEXT: MinLifetimeAccessDensity: 56000
CHECK-NEXT: MaxLifetimeAccessDensity: 56000
CHECK-NEXT: AccessHistogramSize: 8
CHECK-NEXT: AccessHistogram: {{[0-9]+}}
CHECK-NEXT: AccessHistogramValues: 168 147 126 105 84 63 42 21
CHECK-NEXT: AccessHistogramValues: 168 147 126 105 84 63 42 21

View File

@ -8,7 +8,7 @@ additional allocations which do not originate from the main binary are pruned.
CHECK: MemprofProfile:
CHECK-NEXT: Summary:
CHECK-NEXT: Version: 4
CHECK-NEXT: Version: 5
CHECK-NEXT: NumSegments: {{[0-9]+}}
CHECK-NEXT: NumMibInfo: 2
CHECK-NEXT: NumAllocFunctions: 1
@ -96,4 +96,4 @@ CHECK-NEXT: TotalLifetimeAccessDensity: 20000
CHECK-NEXT: MinLifetimeAccessDensity: 20000
CHECK-NEXT: MaxLifetimeAccessDensity: 20000
CHECK-NEXT: AccessHistogramSize: 0
CHECK-NEXT: AccessHistogram: 0
CHECK-NEXT: AccessHistogram: 0

View File

@ -0,0 +1,102 @@
REQUIRES: x86_64-linux
This is a copy of memprof-basic.test with slight changes to check that we can still read v4 of memprofraw.
Inputs cannot and should not be updated.
RUN: llvm-profdata show --memory %p/Inputs/basic_v4.memprofraw --profiled-binary %p/Inputs/basic_v4.memprofexe -o - | FileCheck %s
We expect 2 MIB entries, 1 each for the malloc calls in the program. Any
additional allocations which do not originate from the main binary are pruned.
CHECK: MemprofProfile:
CHECK-NEXT: Summary:
CHECK-NEXT: Version: 4
CHECK-NEXT: NumSegments: {{[0-9]+}}
CHECK-NEXT: NumMibInfo: 2
CHECK-NEXT: NumAllocFunctions: 1
CHECK-NEXT: NumStackOffsets: 2
CHECK-NEXT: Segments:
CHECK-NEXT: -
CHECK-NEXT: BuildId: {{[[:xdigit:]]+}}
CHECK-NEXT: Start: 0x{{[[:xdigit:]]+}}
CHECK-NEXT: End: 0x{{[[:xdigit:]]+}}
CHECK-NEXT: Offset: 0x{{[[:xdigit:]]+}}
CHECK-NEXT: -
CHECK: Records:
CHECK-NEXT: -
CHECK-NEXT: FunctionGUID: {{[0-9]+}}
CHECK-NEXT: AllocSites:
CHECK-NEXT: -
CHECK-NEXT: Callstack:
CHECK-NEXT: -
CHECK-NEXT: Function: {{[0-9]+}}
CHECK-NEXT: SymbolName: main
CHECK-NEXT: LineOffset: 1
CHECK-NEXT: Column: 21
CHECK-NEXT: Inline: 0
CHECK-NEXT: MemInfoBlock:
CHECK-NEXT: AllocCount: 1
CHECK-NEXT: TotalAccessCount: 2
CHECK-NEXT: MinAccessCount: 2
CHECK-NEXT: MaxAccessCount: 2
CHECK-NEXT: TotalSize: 10
CHECK-NEXT: MinSize: 10
CHECK-NEXT: MaxSize: 10
CHECK-NEXT: AllocTimestamp: {{[0-9]+}}
CHECK-NEXT: DeallocTimestamp: {{[0-9]+}}
CHECK-NEXT: TotalLifetime: 0
CHECK-NEXT: MinLifetime: 0
CHECK-NEXT: MaxLifetime: 0
CHECK-NEXT: AllocCpuId: {{[0-9]+}}
CHECK-NEXT: DeallocCpuId: {{[0-9]+}}
CHECK-NEXT: NumMigratedCpu: 0
CHECK-NEXT: NumLifetimeOverlaps: 0
CHECK-NEXT: NumSameAllocCpu: 0
CHECK-NEXT: NumSameDeallocCpu: 0
CHECK-NEXT: DataTypeId: {{[0-9]+}}
CHECK-NEXT: TotalAccessDensity: 20
CHECK-NEXT: MinAccessDensity: 20
CHECK-NEXT: MaxAccessDensity: 20
CHECK-NEXT: TotalLifetimeAccessDensity: 20000
CHECK-NEXT: MinLifetimeAccessDensity: 20000
CHECK-NEXT: MaxLifetimeAccessDensity: 20000
CHECK-NEXT: AccessHistogramSize: 0
CHECK-NEXT: AccessHistogram: 0
CHECK-NEXT: -
CHECK-NEXT: Callstack:
CHECK-NEXT: -
CHECK-NEXT: Function: {{[0-9]+}}
CHECK-NEXT: SymbolName: main
CHECK-NEXT: LineOffset: 4
CHECK-NEXT: Column: 15
CHECK-NEXT: Inline: 0
CHECK-NEXT: MemInfoBlock:
CHECK-NEXT: AllocCount: 1
CHECK-NEXT: TotalAccessCount: 2
CHECK-NEXT: MinAccessCount: 2
CHECK-NEXT: MaxAccessCount: 2
CHECK-NEXT: TotalSize: 10
CHECK-NEXT: MinSize: 10
CHECK-NEXT: MaxSize: 10
CHECK-NEXT: AllocTimestamp: {{[0-9]+}}
CHECK-NEXT: DeallocTimestamp: {{[0-9]+}}
CHECK-NEXT: TotalLifetime: 0
CHECK-NEXT: MinLifetime: 0
CHECK-NEXT: MaxLifetime: 0
CHECK-NEXT: AllocCpuId: {{[0-9]+}}
CHECK-NEXT: DeallocCpuId: {{[0-9]+}}
CHECK-NEXT: NumMigratedCpu: 0
CHECK-NEXT: NumLifetimeOverlaps: 0
CHECK-NEXT: NumSameAllocCpu: 0
CHECK-NEXT: NumSameDeallocCpu: 0
CHECK-NEXT: DataTypeId: {{[0-9]+}}
CHECK-NEXT: TotalAccessDensity: 20
CHECK-NEXT: MinAccessDensity: 20
CHECK-NEXT: MaxAccessDensity: 20
CHECK-NEXT: TotalLifetimeAccessDensity: 20000
CHECK-NEXT: MinLifetimeAccessDensity: 20000
CHECK-NEXT: MaxLifetimeAccessDensity: 20000
CHECK-NEXT: AccessHistogramSize: 0
CHECK-NEXT: AccessHistogram: 0

View File

@ -5,7 +5,7 @@ RUN: llvm-profdata show --memory %p/Inputs/inline.memprofraw --profiled-binary %
CHECK: MemprofProfile:
CHECK-NEXT: Summary:
CHECK-NEXT: Version: 4
CHECK-NEXT: Version: 5
CHECK-NEXT: NumSegments: {{[0-9]+}}
CHECK-NEXT: NumMibInfo: 2
CHECK-NEXT: NumAllocFunctions: 2

View File

@ -7,7 +7,7 @@ We expect 2 MIB entries, 1 each for the malloc calls in the program.
CHECK: MemprofProfile:
CHECK-NEXT: Summary:
CHECK-NEXT: Version: 4
CHECK-NEXT: Version: 5
CHECK-NEXT: NumSegments: {{[0-9]+}}
CHECK-NEXT: NumMibInfo: 2
CHECK-NEXT: NumAllocFunctions: 1

View File

@ -7,7 +7,7 @@ We expect 2 different MIBs with histogram values. This test is to make sure we p
CHECK: MemprofProfile:
CHECK-NEXT: Summary:
CHECK-NEXT: Version: 4
CHECK-NEXT: Version: 5
CHECK-NEXT: NumSegments: {{[0-9]+}}
CHECK-NEXT: NumMibInfo: 2
CHECK-NEXT: NumAllocFunctions: 1
@ -96,4 +96,4 @@ CHEC-NEXT MinLifetimeAccessDensity: 8000
CHEC-NEXT MaxLifetimeAccessDensity: 8000
CHEC-NEXT AccessHistogramSize: 6
CHEC-NEXT AccessHistogram: {{[0-9]+}}
CHEC-NEXT AccessHistogramValues: -2 -0 -0 -0 -1 -1
CHEC-NEXT AccessHistogramValues: -2 -0 -0 -0 -1 -1

View File

@ -11,7 +11,7 @@ RUN: llvm-profdata show --memory %p/Inputs/pic.memprofraw --profiled-binary %p/I
CHECK: MemprofProfile:
CHECK-NEXT: Summary:
CHECK-NEXT: Version: 4
CHECK-NEXT: Version: 5
CHECK-NEXT: NumSegments: {{[0-9]+}}
CHECK-NEXT: NumMibInfo: 2
CHECK-NEXT: NumAllocFunctions: 1
@ -100,4 +100,4 @@ CHECK-NEXT: TotalLifetimeAccessDensity: 20000
CHECK-NEXT: MinLifetimeAccessDensity: 20000
CHECK-NEXT: MaxLifetimeAccessDensity: 20000
CHECK-NEXT: AccessHistogramSize: 0
CHECK-NEXT: AccessHistogram: 0
CHECK-NEXT: AccessHistogram: 0