llvm-project/compiler-rt/lib/xray/tests/unit/function_call_trie_test.cc
Dean Michael Berris 9d6b7a5f2b [XRay][compiler-rt] Simplify Allocator Implementation
Summary:
This change simplifies the XRay Allocator implementation to self-manage
an mmap'ed memory segment instead of using the internal allocator
implementation in sanitizer_common.

Through the benchmarks in D48879, and by profiling those benchmarks, we
found that using the internal allocator in sanitizer_common introduces a
bottleneck: all allocations go through a central spinlock. This change
allows thread-local allocators to eliminate that contention on a
centralized allocator.

To get the most benefit from this approach, we also use a managed
allocator for the chunk elements used by the segmented array
implementation. This gives us the chance to amortize the cost of
allocating memory when creating these internal segmented array data
structures.
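To make this concrete, here is a minimal sketch of the idea (with
hypothetical names, not the actual XRay Allocator API): a self-managed
mmap'ed segment hands out aligned blocks through a simple bump pointer,
and giving each thread its own instance means there is no central lock
to contend on:

  #include <sys/mman.h>
  #include <cstddef>

  // Sketch only: assumes a POSIX mmap and power-of-two alignments.
  class SegmentAllocator {
    unsigned char *Base = nullptr;
    size_t Offset = 0;
    size_t Size = 0;

  public:
    explicit SegmentAllocator(size_t S) : Size(S) {
      void *P = mmap(nullptr, S, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      if (P != MAP_FAILED)
        Base = static_cast<unsigned char *>(P);
    }

    // Bump-pointer allocation: no locking, just an offset adjustment.
    void *Allocate(size_t N, size_t Align) {
      if (Base == nullptr)
        return nullptr;
      size_t Aligned = (Offset + Align - 1) & ~(Align - 1);
      if (Aligned + N > Size)
        return nullptr; // Segment exhausted; no fallback in this sketch.
      Offset = Aligned + N;
      return Base + Aligned;
    }

    ~SegmentAllocator() {
      if (Base != nullptr)
        munmap(Base, Size);
    }
  };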

We also took the opportunity to remove the preallocation argument from
the allocator API, simplifying the usage of the allocator throughout the
profiling implementation.
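For example, in the tests below, allocator construction now takes no
preallocation hint:

  auto A = FunctionCallTrie::InitAllocators();
  FunctionCallTrie Trie(A);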

In this change we also tweak some of the flag values to reduce the
maximum amount of memory we use/need for each thread when requesting
memory through mmap.

Depends on D48956.

Reviewers: kpw, eizan

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D49217

llvm-svn: 337342
2018-07-18 01:53:39 +00:00

//===-- function_call_trie_test.cc ----------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file is a part of XRay, a function call tracing system.
//
//===----------------------------------------------------------------------===//
#include "gtest/gtest.h"
#include "xray_function_call_trie.h"
namespace __xray {
namespace {
TEST(FunctionCallTrieTest, ConstructWithTLSAllocators) {
profilingFlags()->setDefaults();
FunctionCallTrie::Allocators Allocators = FunctionCallTrie::InitAllocators();
FunctionCallTrie Trie(Allocators);
}

TEST(FunctionCallTrieTest, EnterAndExitFunction) {
  profilingFlags()->setDefaults();
  auto A = FunctionCallTrie::InitAllocators();
  FunctionCallTrie Trie(A);
  Trie.enterFunction(1, 1);
  Trie.exitFunction(1, 2);

  // We need a way to pull the data out. Until a data collection service is
  // implemented, we export the data as a list of roots and manually walk
  // through the structure ourselves.
  const auto &R = Trie.getRoots();
  ASSERT_EQ(R.size(), 1u);
  ASSERT_EQ(R.front()->FId, 1);
  ASSERT_EQ(R.front()->CallCount, 1);
  ASSERT_EQ(R.front()->CumulativeLocalTime, 1u);
}

TEST(FunctionCallTrieTest, MissingFunctionEntry) {
  profilingFlags()->setDefaults();
  auto A = FunctionCallTrie::InitAllocators();
  FunctionCallTrie Trie(A);
  Trie.exitFunction(1, 1);
  const auto &R = Trie.getRoots();
  ASSERT_TRUE(R.empty());
}

TEST(FunctionCallTrieTest, NoMatchingEntersForExit) {
  profilingFlags()->setDefaults();
  auto A = FunctionCallTrie::InitAllocators();
  FunctionCallTrie Trie(A);
  Trie.enterFunction(2, 1);
  Trie.enterFunction(3, 3);
  Trie.exitFunction(1, 5);
  const auto &R = Trie.getRoots();
  ASSERT_FALSE(R.empty());
  EXPECT_EQ(R.size(), size_t{1});
}

TEST(FunctionCallTrieTest, MissingFunctionExit) {
  profilingFlags()->setDefaults();
  auto A = FunctionCallTrie::InitAllocators();
  FunctionCallTrie Trie(A);
  Trie.enterFunction(1, 1);
  const auto &R = Trie.getRoots();
  ASSERT_FALSE(R.empty());
  EXPECT_EQ(R.size(), size_t{1});
}

TEST(FunctionCallTrieTest, MultipleRoots) {
  profilingFlags()->setDefaults();
  auto A = FunctionCallTrie::InitAllocators();
  FunctionCallTrie Trie(A);

  // Enter and exit FId = 1.
  Trie.enterFunction(1, 1);
  Trie.exitFunction(1, 2);

  // Enter and exit FId = 2.
  Trie.enterFunction(2, 3);
  Trie.exitFunction(2, 4);

  const auto &R = Trie.getRoots();
  ASSERT_FALSE(R.empty());
  ASSERT_EQ(R.size(), 2u);

  // Check the roots for null before dereferencing them, then make sure they
  // have different IDs.
  const auto R0 = R[0];
  const auto R1 = R[1];
  ASSERT_NE(R0, nullptr);
  ASSERT_NE(R1, nullptr);
  ASSERT_NE(R0->FId, R1->FId);

  // Inspect the roots to confirm they carry the right data.
  EXPECT_EQ(R0->CallCount, 1u);
  EXPECT_EQ(R0->CumulativeLocalTime, 1u);
  EXPECT_EQ(R1->CallCount, 1u);
  EXPECT_EQ(R1->CumulativeLocalTime, 1u);
}

// While missing an intermediary exit may be rare in practice, we still
// enforce that we can handle the case where we've somehow missed the exit
// event for a function in between other call entries and exits. To
// illustrate, imagine the following shadow call stack:
//
//   f0@t0 -> f1@t1 -> f2@t2
//
// If for whatever reason we see an exit for `f2` @ t3, followed by an exit
// for `f0` @ t4 (i.e. no `f1` exit in between) then we need to handle the
// case of accounting local time to `f2` as d = (t3 - t2), then local time
// to `f1` as d' = (t4 - t1) (charged when the exit at t4 unwinds past it),
// and then local time to `f0` as d'' = (t4 - t0) - d'.
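//
// Concretely, with the timestamps used in the test below (t0 = 0, t1 = 100,
// t2 = 200, t3 = 300, t4 = 400), this works out to:
//
//   d   = 300 - 200       = 100  (f2, i.e. FId 3)
//   d'  = 400 - 100       = 300  (f1, i.e. FId 2)
//   d'' = (400 - 0) - 300 = 100  (f0, i.e. FId 1)
//
// matching the CumulativeLocalTime expectations at the end of the test.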
TEST(FunctionCallTrieTest, MissingIntermediaryExit) {
  profilingFlags()->setDefaults();
  auto A = FunctionCallTrie::InitAllocators();
  FunctionCallTrie Trie(A);
  Trie.enterFunction(1, 0);
  Trie.enterFunction(2, 100);
  Trie.enterFunction(3, 200);
  Trie.exitFunction(3, 300);
  Trie.exitFunction(1, 400);

  // What we should see at this point is all the functions in the trie in a
  // specific order (1 -> 2 -> 3) with the appropriate count(s) and local
  // latencies.
  const auto &R = Trie.getRoots();
  ASSERT_FALSE(R.empty());
  ASSERT_EQ(R.size(), 1u);

  const auto &F1 = *R[0];
  ASSERT_EQ(F1.FId, 1);
  ASSERT_FALSE(F1.Callees.empty());

  const auto &F2 = *F1.Callees[0].NodePtr;
  ASSERT_EQ(F2.FId, 2);
  ASSERT_FALSE(F2.Callees.empty());

  const auto &F3 = *F2.Callees[0].NodePtr;
  ASSERT_EQ(F3.FId, 3);
  ASSERT_TRUE(F3.Callees.empty());

  // Now that we've established the preconditions, we check for specific
  // aspects of the nodes.
  EXPECT_EQ(F3.CallCount, 1);
  EXPECT_EQ(F2.CallCount, 1);
  EXPECT_EQ(F1.CallCount, 1);
  EXPECT_EQ(F3.CumulativeLocalTime, 100);
  EXPECT_EQ(F2.CumulativeLocalTime, 300);
  EXPECT_EQ(F1.CumulativeLocalTime, 100);
}

TEST(FunctionCallTrieTest, DeepCallStack) {
  // Simulate a relatively deep call stack (32 levels) and ensure that we can
  // properly pop all the way up the stack.
  profilingFlags()->setDefaults();
  auto A = FunctionCallTrie::InitAllocators();
  FunctionCallTrie Trie(A);
  for (int i = 0; i < 32; ++i)
    Trie.enterFunction(i + 1, i);
  Trie.exitFunction(1, 33);

  // Here, validate that we have a 32-level-deep function call path from the
  // root (FId = 1) down to the leaf (FId = 32).
  const auto &R = Trie.getRoots();
  ASSERT_EQ(R.size(), 1u);
  auto F = R[0];
  for (int i = 0; i < 32; ++i) {
    EXPECT_EQ(F->FId, i + 1);
    EXPECT_EQ(F->CallCount, 1);
    if (F->Callees.empty() && i != 31)
      FAIL() << "Empty callees for FId " << F->FId;
    if (i != 31)
      F = F->Callees[0].NodePtr;
  }
}

// TODO: Test that we can handle cross-CPU migrations, where TSCs are not
// guaranteed to be synchronised.

TEST(FunctionCallTrieTest, DeepCopy) {
  profilingFlags()->setDefaults();
  auto A = FunctionCallTrie::InitAllocators();
  FunctionCallTrie Trie(A);

  Trie.enterFunction(1, 0);
  Trie.enterFunction(2, 1);
  Trie.exitFunction(2, 2);
  Trie.enterFunction(3, 3);
  Trie.exitFunction(3, 4);
  Trie.exitFunction(1, 5);

  // We want to make a deep copy and compare notes.
  auto B = FunctionCallTrie::InitAllocators();
  FunctionCallTrie Copy(B);
  Trie.deepCopyInto(Copy);

  ASSERT_NE(Trie.getRoots().size(), 0u);
  ASSERT_EQ(Trie.getRoots().size(), Copy.getRoots().size());
  const auto &R0Orig = *Trie.getRoots()[0];
  const auto &R0Copy = *Copy.getRoots()[0];
  EXPECT_EQ(R0Orig.FId, 1);
  EXPECT_EQ(R0Orig.FId, R0Copy.FId);
  ASSERT_EQ(R0Orig.Callees.size(), 2u);
  ASSERT_EQ(R0Copy.Callees.size(), 2u);
  const auto &F1Orig =
      *R0Orig.Callees
           .find_element(
               [](const FunctionCallTrie::NodeIdPair &R) { return R.FId == 2; })
           ->NodePtr;
  const auto &F1Copy =
      *R0Copy.Callees
           .find_element(
               [](const FunctionCallTrie::NodeIdPair &R) { return R.FId == 2; })
           ->NodePtr;
  EXPECT_EQ(&R0Orig, F1Orig.Parent);
  EXPECT_EQ(&R0Copy, F1Copy.Parent);
}

TEST(FunctionCallTrieTest, MergeInto) {
  profilingFlags()->setDefaults();
  auto A = FunctionCallTrie::InitAllocators();
  FunctionCallTrie T0(A);
  FunctionCallTrie T1(A);

  // 1 -> 2 -> 3
  T0.enterFunction(1, 0);
  T0.enterFunction(2, 1);
  T0.enterFunction(3, 2);
  T0.exitFunction(3, 3);
  T0.exitFunction(2, 4);
  T0.exitFunction(1, 5);

  // 1 -> 2 -> 3
  T1.enterFunction(1, 0);
  T1.enterFunction(2, 1);
  T1.enterFunction(3, 2);
  T1.exitFunction(3, 3);
  T1.exitFunction(2, 4);
  T1.exitFunction(1, 5);

  // We use a different allocator here to make sure that we're able to
  // transfer data into a FunctionCallTrie which uses a different allocator.
  // This reflects the intended usage scenario when collecting profiles that
  // aggregate across threads.
  auto B = FunctionCallTrie::InitAllocators();
  FunctionCallTrie Merged(B);
  T0.mergeInto(Merged);
  T1.mergeInto(Merged);
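  // Each source trie contributes a CallCount of 1 per node, and accumulates
  // CumulativeLocalTime of 5 - 0 = 5 for FId 1, 4 - 1 = 3 for FId 2, and
  // 3 - 2 = 1 for FId 3; merging the two identical tries doubles these,
  // which is what the expectations below check.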
  ASSERT_EQ(Merged.getRoots().size(), 1u);
  const auto &R0 = *Merged.getRoots()[0];
  EXPECT_EQ(R0.FId, 1);
  EXPECT_EQ(R0.CallCount, 2);
  EXPECT_EQ(R0.CumulativeLocalTime, 10);
  EXPECT_EQ(R0.Callees.size(), 1u);

  const auto &F1 = *R0.Callees[0].NodePtr;
  EXPECT_EQ(F1.FId, 2);
  EXPECT_EQ(F1.CallCount, 2);
  EXPECT_EQ(F1.CumulativeLocalTime, 6);
  EXPECT_EQ(F1.Callees.size(), 1u);

  const auto &F2 = *F1.Callees[0].NodePtr;
  EXPECT_EQ(F2.FId, 3);
  EXPECT_EQ(F2.CallCount, 2);
  EXPECT_EQ(F2.CumulativeLocalTime, 2);
  EXPECT_EQ(F2.Callees.size(), 0u);
}

} // namespace
} // namespace __xray