This would lead to "error: too many open files" when processing large numbers of commits.
268 lines
11 KiB
Python
Executable File
268 lines
11 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
|
|
import argparse
|
|
import datetime
|
|
import functools
|
|
import os
|
|
import pathlib
|
|
import re
|
|
import statistics
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
|
|
import git
|
|
import pandas
|
|
import plotly
|
|
import plotly.express
|
|
import tqdm
|
|
|
|
@functools.total_ordering
class Commit:
    """
    This class represents a commit inside a given Git repository.

    Two commits compare equal when they resolve to the same full SHA. Commits are ordered by
    ancestry: `a < b` holds when `a` is a strict ancestor of `b`. Ancestry is only a partial
    order, so comparisons between commits on unrelated lines of history are not meaningful.

    Information obtained from Git is cached on the instance, so each query is run at most
    once per commit object.
    """

    def __init__(self, git_repo: 'git.Repo', sha: str):
        self._git_repo = git_repo
        self._sha = sha

    def _git_command(self, *args):
        """
        Return the argument list for running `git` with the given arguments in this repository.
        """
        return ['git', '-C', self._git_repo.git_dir, *args]

    def __eq__(self, other):
        """
        Return whether two commits refer to the same commit.

        This doesn't take into account the content of the Git tree at those commits, only the
        'identity' of the commits themselves. Comparing against an object that is not a
        `Commit` yields `NotImplemented` so that Python falls back to its default handling.
        """
        if not isinstance(other, Commit):
            return NotImplemented
        return self.fullrev == other.fullrev

    def __lt__(self, other):
        """
        Return whether a commit is a strict ancestor of another commit in the Git repository.

        Raises `RuntimeError` if Git is unable to determine the relationship between the commits.
        """
        if not isinstance(other, Commit):
            return NotImplemented

        # Is self._sha an ancestor of other._sha?
        res = subprocess.run(self._git_command('merge-base', '--is-ancestor', self._sha, other._sha))
        if res.returncode not in (0, 1):
            raise RuntimeError(f'Error when trying to obtain the commit order for {self._sha} and {other._sha}')

        # Git considers a commit to be an ancestor of itself, but `<` must be strict for the
        # comparisons derived by `functools.total_ordering` to be consistent with `__eq__`.
        return res.returncode == 0 and self != other

    def __hash__(self):
        """
        Return a hash based on the full revision for this commit, consistently with `__eq__`.
        """
        return hash(self.fullrev)

    def show(self, include_diff=False):
        """
        Return the commit information equivalent to `git show` associated to this commit.

        When `include_diff` is false, the patch is omitted from the output. Results are cached
        per instance, separately for each value of `include_diff`.
        """
        # Use a per-instance cache rather than `functools.cache` on the method, which would keep
        # every Commit object alive in a global cache for the lifetime of the program.
        cache = self.__dict__.setdefault('_show_cache', {})
        key = bool(include_diff)
        if key not in cache:
            cmd = self._git_command('show', self._sha)
            if not include_diff:
                cmd.append('--no-patch')
            cache[key] = subprocess.check_output(cmd, text=True)
        return cache[key]

    @functools.cached_property
    def shortrev(self):
        """
        Return the shortened version of the given SHA.
        """
        return subprocess.check_output(self._git_command('rev-parse', '--short', self._sha), text=True).strip()

    @functools.cached_property
    def fullrev(self):
        """
        Return the full SHA associated to this commit.
        """
        return subprocess.check_output(self._git_command('rev-parse', self._sha), text=True).strip()

    @functools.cached_property
    def commit_date(self):
        """
        Return the date of the commit as a `datetime.datetime` object.

        The result is a naive `datetime` expressed in local time, since no timezone is passed
        to `datetime.datetime.fromtimestamp`.
        """
        return datetime.datetime.fromtimestamp(self._git_repo.commit(self._sha).committed_date)

    def prefetch(self):
        """
        Prefetch cached properties associated to this commit object.

        This makes it possible to control when time is spent recovering that information from Git for
        e.g. better reporting to the user.
        """
        self.commit_date
        self.fullrev
        self.shortrev
        self.show()

    def __str__(self):
        """
        Return the revision exactly as it was provided when creating this object.
        """
        return self._sha
|
|
|
|
def truncate_lines(string, n, marker=None):
    """
    Truncate the given string at a certain number of lines.

    At most `n` lines of `string` are kept and joined back with newlines. Optionally, when
    truncation has happened, the last kept line is replaced by `marker` to make the truncation
    visible; the result therefore never exceeds `n` lines. With `n == 0` the result is always
    the empty string, since there is no line left to hold the marker.

    Raises `ValueError` if `n` is negative.
    """
    if n < 0:
        raise ValueError(f'cannot truncate to a negative number of lines: {n}')

    lines = string.splitlines()
    truncated = lines[:n]
    # Only place the marker when something was actually dropped and when there is a line
    # available to hold it (there is none when n == 0).
    if marker is not None and truncated and len(lines) > len(truncated):
        truncated[-1] = marker
    assert len(truncated) <= n, "broken post-condition"
    return '\n'.join(truncated)
|
|
|
|
def create_plot(data, metric, trendline=None, subtitle=None):
    """
    Build a scatter plot of `metric` for each benchmark across the commits found in `data`.

    The rows are ordered by their 'revlist_order' and 'benchmark' columns before plotting. The
    title of the figure names the first and the last commit of that ordering, and hovering
    over a point shows the (truncated) `git show` summary of the corresponding commit.
    """
    ordered = data.sort_values(by=['revlist_order', 'benchmark'])
    commits = pandas.unique(ordered['commit'])  # distinct commits, in the order they appear once sorted

    # Compute the hover text once per commit instead of once per row.
    hover_text = {}
    for commit in commits:
        summary = truncate_lines(commit.show(), 30, marker='...')
        hover_text[commit] = summary.replace('\n', '<br>')

    first, last = commits[0], commits[-1]
    return plotly.express.scatter(ordered,
                                  title=f"{first.shortrev} to {last.shortrev}",
                                  subtitle=subtitle,
                                  x='revlist_order',
                                  y=metric,
                                  symbol='benchmark',
                                  color='benchmark',
                                  hover_name=[hover_text[commit] for commit in ordered['commit']],
                                  trendline=trendline)
|
|
|
|
def directory_path(string):
    """
    Convert the given string to a `pathlib.Path`, requiring it to name an existing directory.

    Raises `NotADirectoryError` carrying the offending string otherwise.
    """
    if not os.path.isdir(string):
        raise NotADirectoryError(string)
    return pathlib.Path(string)
|
|
|
|
def parse_lnt(lines, aggregate=statistics.median):
    """
    Parse lines in LNT format and return a list of dictionaries of the form:

        [
            {
                'benchmark': <benchmark1>,
                <metric1>: float,
                <metric2>: float,
                ...
            },
            {
                'benchmark': <benchmark2>,
                <metric1>: float,
                <metric2>: float,
                ...
            },
            ...
        ]

    Each non-blank line is expected to look like `<benchmark>.<metric> <value>`. The metric is
    whatever follows the last '.' of the identifier, so benchmark names may themselves contain
    dots. The value is the last whitespace-separated field of the line.

    If a metric has multiple values associated to it, they are aggregated into a single
    value using the provided aggregation function.

    Raises `ValueError` if a line is malformed or if its value is not a valid number.
    """
    samples = {}  # benchmark name -> metric name -> list of values, in order of first appearance
    for line in lines:
        line = line.strip()
        if not line:
            continue

        fields = line.rsplit(maxsplit=1)
        if len(fields) != 2:
            raise ValueError(f'Malformed LNT line (expected "<benchmark>.<metric> <value>"): {line!r}')
        (identifier, value) = fields

        # Split on the last dot only: benchmark names can contain dots, the metric name is last.
        (benchmark, dot, metric) = identifier.rpartition('.')
        if not dot:
            raise ValueError(f'Malformed LNT line (expected "<benchmark>.<metric> <value>"): {line!r}')
        if metric == 'benchmark':
            # That key is reserved for the name of the benchmark in the resulting dictionaries.
            raise ValueError(f'Metric name "benchmark" is reserved: {line!r}')

        samples.setdefault(benchmark, {}).setdefault(metric, []).append(float(value))

    return [{'benchmark': benchmark} | {metric: aggregate(values) for (metric, values) in metrics.items()}
            for (benchmark, metrics) in samples.items()]
|
|
|
|
def sorted_revlist(git_repo, commits):
    """
    Return the list of commits sorted by their chronological order (from oldest to newest) in the
    provided Git repository. Items earlier in the list are older than items later in the list.

    `git_repo` is the path passed to `git -C` and `commits` is an iterable of revisions. The
    revisions are returned as printed by `git rev-list --no-walk`, in reverse order of its output.
    An empty iterable of commits results in an empty list, without invoking Git.

    Raises `subprocess.CalledProcessError` if Git fails, e.g. on an unknown revision.
    """
    commits = list(commits)
    if not commits:
        # `git rev-list` refuses to run without any revision, and there is nothing to sort anyway.
        return []

    revlist_cmd = ['git', '-C', git_repo, 'rev-list', '--no-walk'] + commits
    revlist = subprocess.check_output(revlist_cmd, text=True).strip().splitlines()
    return list(reversed(revlist))
|
|
|
|
def main(argv):
    """
    Generate an interactive HTML chart from a directory of `<commit>.lnt` files.

    `argv` is the list of command-line arguments, without the program name. The LNT files are
    parsed, associated to their commit in the Git repository, ordered according to the history
    of that repository and plotted for the requested metric. The chart is written to the
    requested output file, or to a temporary file that is left on disk when none is provided.

    Raises `RuntimeError` when there is no data to plot.
    """
    parser = argparse.ArgumentParser(
        prog='visualize-historical',
        description='Visualize historical data in LNT format. This program generates a HTML file that embeds an '
                    'interactive plot with the provided data. The HTML file can then be opened in a browser to '
                    'visualize the data as a chart.',
        epilog='This script depends on the modules listed in `libcxx/utils/requirements.txt`.')
    parser.add_argument('directory', type=directory_path,
        help='Path to a valid directory containing benchmark data in LNT format, each file being named <commit>.lnt. '
             'This is also the format generated by the `benchmark-historical` utility.')
    parser.add_argument('--output', '-o', type=pathlib.Path, required=False,
        help='Optional path where to output the resulting HTML file. If it already exists, it is overwritten. '
             'Defaults to a temporary file which is opened automatically once generated, but not removed after '
             'creation.')
    parser.add_argument('--metric', type=str, default='execution_time',
        help='The metric to compare. LNT data may contain multiple metrics (e.g. code size, execution time, etc) -- '
             'this option allows selecting which metric is being visualized. The default is "execution_time".')
    parser.add_argument('--filter', type=str, required=False,
        help='An optional regular expression used to filter the benchmarks included in the chart. '
             'Only benchmarks whose names match the regular expression will be included. '
             'Since the chart is interactive, it generally makes most sense to include all the benchmarks '
             'and to then filter them in the browser, but in some cases producing a chart with a reduced '
             'number of data series is useful.')
    parser.add_argument('--subtitle', type=str, required=False,
        help='Optional subtitle for the chart. This can be used to help identify the contents of the chart.')
    parser.add_argument('--git-repo', type=directory_path, default=pathlib.Path(os.getcwd()),
        help='Path to the git repository to use for ordering commits in time. '
             'By default, the current working directory is used.')
    parser.add_argument('--open', action='store_true',
        help='Whether to automatically open the generated HTML file when finished. If no output file is provided, '
             'the resulting benchmark is opened automatically by default.')
    parser.add_argument('--trendline', type=str, required=False, default=None, choices=('ols', 'lowess', 'expanding'),
        help='Optional trendline to add on each series in the chart. See the documentation in '
             'https://plotly.com/python-api-reference/generated/plotly.express.trendline_functions.html '
             'for details on each option.')
    args = parser.parse_args(argv)
    repo = git.Repo(args.git_repo)

    # Extract benchmark data from the directory.
    files = list(args.directory.glob('*.lnt'))
    if not files:
        raise RuntimeError(f'No .lnt files found in "{args.directory}" -- nothing to plot')

    data = {}
    for file in tqdm.tqdm(files, desc='Parsing LNT files'):
        rows = parse_lnt(file.read_text().splitlines())
        # The file is named after the commit it contains data for.
        (commit, _) = os.path.splitext(os.path.basename(file))
        commit = Commit(repo, commit)
        data[commit] = rows

    # Obtain commit information which is then cached throughout the program. Do this
    # eagerly so we can provide a progress bar.
    for commit in tqdm.tqdm(data.keys(), desc='Prefetching Git information'):
        commit.prefetch()

    # Create a dataframe from the raw data and add some columns to it:
    # - 'commit' represents the Commit object associated to the results in that row
    # - `revlist_order` represents the order of the commit within the Git repository.
    # - `date` represents the commit date
    revlist = sorted_revlist(args.git_repo, [c.fullrev for c in data.keys()])
    # Map each revision to its position once, instead of searching the list for every row.
    revlist_order = {}
    for (position, rev) in enumerate(revlist):
        revlist_order.setdefault(rev, position)
    data = pandas.DataFrame([row | {'commit': c} for (c, rows) in data.items() for row in rows])
    data = data.join(pandas.DataFrame([{'revlist_order': revlist_order[c.fullrev]} for c in data['commit']]))
    data = data.join(pandas.DataFrame([{'date': c.commit_date} for c in data['commit']]))

    # Filter the benchmarks if needed.
    if args.filter is not None:
        keeplist = [b for b in data['benchmark'] if re.search(args.filter, b) is not None]
        data = data[data['benchmark'].isin(keeplist)]
        if len(data) == 0:
            raise RuntimeError(f'Filter "{args.filter}" resulted in empty data set -- nothing to plot')

    # Plot the data for all the required benchmarks.
    figure = create_plot(data, args.metric, trendline=args.trendline, subtitle=args.subtitle)
    do_open = args.output is None or args.open
    if args.output is not None:
        output = args.output
    else:
        # Reserve a temporary file name that outlives this process. The file is explicitly not
        # deleted on close, so that the browser can still load it after we're done.
        with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as temporary:
            output = temporary.name
    plotly.io.write_html(figure, file=output, auto_open=do_open)
|
|
|
|
if __name__ == '__main__':
    # Only run when executed as a script; pass the arguments without the program name.
    command_line = sys.argv[1:]
    main(command_line)
|