llvm-project/llvm/unittests/Support/SuffixTreeTest.cpp
Xuan Zhang d9a00ed366
[MachineOutliner] Leaf Descendants (#90275)
This PR  depends on https://github.com/llvm/llvm-project/pull/90264

In the current implementation, only leaf children of each internal node
in the suffix tree are included as candidates for outlining. But all
leaf descendants are outlining candidates, which we include in the new
implementation. This is enabled on a flag `outliner-leaf-descendants`
which is default to be true.

The reason for _enabling this on a flag_ is because machine outliner is
not the only pass that uses suffix tree.

The reason for _having this default to be true_ is because including all
leaf descendants show consistent size win.
* For Clang/LLD, it shows around 3% reduction in text segment size when
compared to the baseline `-Oz` linker binary.
 * For selected benchmark tests in LLVM test suite 
 
| run (CTMark/) | only leaf children | all leaf descendants | reduction
% |

|------------------|--------------------|----------------------|-------------|
| lencod | 349624 | 348564 | -0.2004% |
| SPASS | 219672 | 218440 | -0.4738% |
| kc | 271956 | 250068 | -0.4506% |
| sqlite3 | 223920 | 222484 | -0.5471% |
| 7zip-benchmark | 405364 | 401244 | -0.3428% |
| bullet | 139820 | 138340 | -0.8315% |
| consumer-typeset | 295684 | 286628 | -1.2295% |
| pairlocalalign | 72236 | 71936 | -0.2164% |
| tramp3d-v4 | 189572 | 183676 | -2.9668% |

This is part of an enhanced version of machine outliner -- see
[RFC](https://discourse.llvm.org/t/rfc-enhanced-machine-outliner-part-1-fulllto-part-2-thinlto-nolto-to-come/78732).
2024-06-18 07:13:05 -07:00

278 lines
11 KiB
C++

//===- unittests/Support/SuffixTreeTest.cpp - suffix tree tests -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "llvm/Support/SuffixTree.h"
#include "gtest/gtest.h"
#include <vector>
using namespace llvm;
namespace {
// Each example vector has a unique element at the end to represent the end of
// the string
// Tests that The SuffixTree finds a simple repetition of the substring {1, 2}
// {1, 2} twice in the provided string.
TEST(SuffixTreeTest, TestSingleRepetition) {
std::vector<unsigned> SimpleRepetitionData = {1, 2, 1, 2, 3};
SuffixTree ST(SimpleRepetitionData);
std::vector<SuffixTree::RepeatedSubstring> SubStrings;
for (auto It = ST.begin(); It != ST.end(); It++)
SubStrings.push_back(*It);
ASSERT_EQ(SubStrings.size(), 1u);
EXPECT_EQ(SubStrings[0].Length, 2u);
EXPECT_EQ(SubStrings[0].StartIndices.size(), 2u);
for (unsigned StartIdx : SubStrings[0].StartIndices) {
EXPECT_EQ(SimpleRepetitionData[StartIdx], 1u);
EXPECT_EQ(SimpleRepetitionData[StartIdx + 1], 2u);
}
}
// Tests that the SuffixTree is able to find the substrings {1, 2, 3} at
// at indices 0 and 3 as well as the substrings {2, 3} at indices 1 and 4.
// This test also serves as a flag for improvements to the suffix tree.
//
// FIXME: Right now, the longest repeated substring from a specific index is
// returned, this could be improved to return the longest repeated substring, as
// well as those that are smaller.
TEST(SuffixTreeTest, TestLongerRepetition) {
std::vector<unsigned> RepeatedRepetitionData = {1, 2, 3, 1, 2, 3, 4};
SuffixTree ST(RepeatedRepetitionData);
std::vector<SuffixTree::RepeatedSubstring> SubStrings;
for (auto It = ST.begin(); It != ST.end(); It++)
SubStrings.push_back(*It);
EXPECT_EQ(SubStrings.size(), 2u);
unsigned Len;
for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
Len = RS.Length;
bool IsExpectedLen = (Len == 3u || Len == 2u);
bool IsExpectedIndex;
ASSERT_TRUE(IsExpectedLen);
if (Len == 3u) {
for (unsigned StartIdx : RS.StartIndices) {
IsExpectedIndex = (StartIdx == 0u || StartIdx == 3u);
EXPECT_TRUE(IsExpectedIndex);
EXPECT_EQ(RepeatedRepetitionData[StartIdx], 1u);
EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 2u);
EXPECT_EQ(RepeatedRepetitionData[StartIdx + 2], 3u);
}
} else {
for (unsigned StartIdx : RS.StartIndices) {
IsExpectedIndex = (StartIdx == 1u || StartIdx == 4u);
EXPECT_TRUE(IsExpectedIndex);
EXPECT_EQ(RepeatedRepetitionData[StartIdx], 2u);
EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 3u);
}
}
}
}
// Tests that the SuffixTree is able to find substring {1, 1, 1, 1, 1} at
// indices 0 and 1.
//
// FIXME: Add support for detecting {1, 1} and {1, 1, 1}
// See Test TestSingleCharacterRepeatWithLeafDescendants for the fix
TEST(SuffixTreeTest, TestSingleCharacterRepeat) {
std::vector<unsigned> RepeatedRepetitionData = {1, 1, 1, 1, 1, 1, 2};
std::vector<unsigned>::iterator RRDIt, RRDIt2;
SuffixTree ST(RepeatedRepetitionData);
std::vector<SuffixTree::RepeatedSubstring> SubStrings;
for (auto It = ST.begin(); It != ST.end(); It++)
SubStrings.push_back(*It);
EXPECT_EQ(SubStrings.size(), 1u);
for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
EXPECT_EQ(RS.StartIndices.size(),
RepeatedRepetitionData.size() - RS.Length);
for (unsigned StartIdx : SubStrings[0].StartIndices) {
RRDIt = RRDIt2 = RepeatedRepetitionData.begin();
std::advance(RRDIt, StartIdx);
std::advance(RRDIt2, StartIdx + SubStrings[0].Length);
ASSERT_TRUE(
all_of(make_range<std::vector<unsigned>::iterator>(RRDIt, RRDIt2),
[](unsigned Elt) { return Elt == 1; }));
}
}
}
// Tests that the SuffixTree is able to find the following substrings:
// {1, 1} at indices 0, 1, 2, 3, and 4;
// {1, 1, 1} at indices 0, 1, 2, and 3;
// {1, 1, 1, 1} at indices 0, 1, and 2; and
// {1, 1, 1, 1, 1} at indices 0 and 1.
//
// This is the fix for TestSingleCharacterRepeat.
TEST(SuffixTreeTest, TestSingleCharacterRepeatWithLeafDescendants) {
std::vector<unsigned> RepeatedRepetitionData = {1, 1, 1, 1, 1, 1, 2};
std::vector<unsigned>::iterator RRDIt, RRDIt2;
SuffixTree ST(RepeatedRepetitionData, /*OutlinerLeafDescendants=*/true);
std::vector<SuffixTree::RepeatedSubstring> SubStrings;
for (auto It = ST.begin(); It != ST.end(); It++)
SubStrings.push_back(*It);
EXPECT_EQ(SubStrings.size(), 4u);
for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
EXPECT_EQ(RS.StartIndices.size(),
RepeatedRepetitionData.size() - RS.Length);
for (unsigned StartIdx : SubStrings[0].StartIndices) {
RRDIt = RRDIt2 = RepeatedRepetitionData.begin();
std::advance(RRDIt, StartIdx);
std::advance(RRDIt2, StartIdx + SubStrings[0].Length);
ASSERT_TRUE(
all_of(make_range<std::vector<unsigned>::iterator>(RRDIt, RRDIt2),
[](unsigned Elt) { return Elt == 1; }));
}
}
}
// The suffix tree cannot currently find repeated substrings in strings of the
// form {1, 2, 3, 1, 2, 3}, because the two {1, 2, 3}s are adjacent ("tandem
// repeats")
//
// FIXME: Teach the SuffixTree to recognize these cases.
TEST(SuffixTreeTest, TestTandemRepeat) {
std::vector<unsigned> RepeatedRepetitionData = {1, 2, 3, 1, 2, 3};
SuffixTree ST(RepeatedRepetitionData);
std::vector<SuffixTree::RepeatedSubstring> SubStrings;
for (auto It = ST.begin(); It != ST.end(); It++)
SubStrings.push_back(*It);
EXPECT_EQ(SubStrings.size(), 0u);
}
// Tests that the SuffixTree does not erroneously include values that are not
// in repeated substrings. That is, only finds {1, 1} at indices 0 and 3 and
// does not include 2 and 3.
TEST(SuffixTreeTest, TestExclusion) {
std::vector<unsigned> RepeatedRepetitionData = {1, 1, 2, 1, 1, 3};
std::vector<unsigned>::iterator RRDIt, RRDIt2;
SuffixTree ST(RepeatedRepetitionData);
std::vector<SuffixTree::RepeatedSubstring> SubStrings;
for (auto It = ST.begin(); It != ST.end(); It++)
SubStrings.push_back(*It);
EXPECT_EQ(SubStrings.size(), 1u);
bool IsExpectedIndex;
for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
for (unsigned StartIdx : RS.StartIndices) {
IsExpectedIndex = (StartIdx == 0u || StartIdx == 3u);
EXPECT_TRUE(IsExpectedIndex);
RRDIt = RRDIt2 = RepeatedRepetitionData.begin();
std::advance(RRDIt, StartIdx);
std::advance(RRDIt2, StartIdx + RS.Length);
EXPECT_TRUE(
all_of(make_range<std::vector<unsigned>::iterator>(RRDIt, RRDIt2),
[](unsigned Elt) { return Elt == 1; }));
}
}
}
// Tests that the SuffixTree is able to find three substrings
// {1, 2, 3} at indices 6 and 10;
// {2, 3} at indices 7 and 11; and
// {1, 2} at indicies 0 and 3.
//
// FIXME: {1, 2} has indices 6 and 10 missing as it is a substring of {1, 2, 3}
// See Test TestSubstringRepeatsWithLeafDescendants for the fix
TEST(SuffixTreeTest, TestSubstringRepeats) {
std::vector<unsigned> RepeatedRepetitionData = {1, 2, 100, 1, 2, 101, 1,
2, 3, 103, 1, 2, 3, 104};
SuffixTree ST(RepeatedRepetitionData);
std::vector<SuffixTree::RepeatedSubstring> SubStrings;
for (auto It = ST.begin(); It != ST.end(); It++)
SubStrings.push_back(*It);
EXPECT_EQ(SubStrings.size(), 3u);
unsigned Len;
for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
Len = RS.Length;
bool IsExpectedLen = (Len == 3u || Len == 2u);
ASSERT_TRUE(IsExpectedLen);
bool IsExpectedIndex;
if (Len == 3u) { // {1, 2, 3}
EXPECT_EQ(RS.StartIndices.size(), 2u);
for (unsigned StartIdx : RS.StartIndices) {
IsExpectedIndex = (StartIdx == 6u || StartIdx == 10u);
EXPECT_TRUE(IsExpectedIndex);
EXPECT_EQ(RepeatedRepetitionData[StartIdx], 1u);
EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 2u);
EXPECT_EQ(RepeatedRepetitionData[StartIdx + 2], 3u);
}
} else {
if (RepeatedRepetitionData[RS.StartIndices[0]] == 1u) { // {1, 2}
EXPECT_EQ(RS.StartIndices.size(), 2u);
for (unsigned StartIdx : RS.StartIndices) {
IsExpectedIndex = (StartIdx == 0u || StartIdx == 3u);
EXPECT_TRUE(IsExpectedIndex);
EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 2u);
}
} else { // {2, 3}
EXPECT_EQ(RS.StartIndices.size(), 2u);
for (unsigned StartIdx : RS.StartIndices) {
IsExpectedIndex = (StartIdx == 7u || StartIdx == 11u);
EXPECT_TRUE(IsExpectedIndex);
EXPECT_EQ(RepeatedRepetitionData[StartIdx], 2u);
EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 3u);
}
}
}
}
}
// Tests that the SuffixTree is able to find three substrings
// {1, 2, 3} at indices 6 and 10;
// {2, 3} at indices 7 and 11; and
// {1, 2} at indicies 0, 3, 6, and 10.
//
// This is the fix for TestSubstringRepeats
TEST(SuffixTreeTest, TestSubstringRepeatsWithLeafDescendants) {
std::vector<unsigned> RepeatedRepetitionData = {1, 2, 100, 1, 2, 101, 1,
2, 3, 103, 1, 2, 3, 104};
SuffixTree ST(RepeatedRepetitionData, /*OutlinerLeafDescendants=*/true);
std::vector<SuffixTree::RepeatedSubstring> SubStrings;
for (auto It = ST.begin(); It != ST.end(); It++)
SubStrings.push_back(*It);
EXPECT_EQ(SubStrings.size(), 3u);
unsigned Len;
for (SuffixTree::RepeatedSubstring &RS : SubStrings) {
Len = RS.Length;
bool IsExpectedLen = (Len == 3u || Len == 2u);
ASSERT_TRUE(IsExpectedLen);
bool IsExpectedIndex;
if (Len == 3u) { // {1, 2, 3}
EXPECT_EQ(RS.StartIndices.size(), 2u);
for (unsigned StartIdx : RS.StartIndices) {
IsExpectedIndex = (StartIdx == 6u || StartIdx == 10u);
EXPECT_TRUE(IsExpectedIndex);
EXPECT_EQ(RepeatedRepetitionData[StartIdx], 1u);
EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 2u);
EXPECT_EQ(RepeatedRepetitionData[StartIdx + 2], 3u);
}
} else {
if (RepeatedRepetitionData[RS.StartIndices[0]] == 1u) { // {1, 2}
EXPECT_EQ(RS.StartIndices.size(), 4u);
for (unsigned StartIdx : RS.StartIndices) {
IsExpectedIndex = (StartIdx == 0u || StartIdx == 3u ||
StartIdx == 6u || StartIdx == 10u);
EXPECT_TRUE(IsExpectedIndex);
EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 2u);
}
} else { // {2, 3}
EXPECT_EQ(RS.StartIndices.size(), 2u);
for (unsigned StartIdx : RS.StartIndices) {
IsExpectedIndex = (StartIdx == 7u || StartIdx == 11u);
EXPECT_TRUE(IsExpectedIndex);
EXPECT_EQ(RepeatedRepetitionData[StartIdx], 2u);
EXPECT_EQ(RepeatedRepetitionData[StartIdx + 1], 3u);
}
}
}
}
}
} // namespace