llvm-project/libcxx/utils/visualize-historical

#!/usr/bin/env python3

import argparse
import datetime
import functools
import os
import pathlib
import re
import statistics
import subprocess
import sys
import tempfile

import git
import pandas
import plotly
import plotly.express
import tqdm

@functools.total_ordering
class Commit:
    """
    This class represents a commit inside a given Git repository.

    Two commits compare equal when they resolve to the same full revision. Commits are ordered
    by ancestry: `a < b` holds when `a` is a strict ancestor of `b`. Ancestry is only a partial
    order, so the comparisons derived by `functools.total_ordering` are only meaningful for
    commits that lie on the same line of history.

    Information obtained from Git is cached on the instance, so each piece is queried at most once.
    """

    def __init__(self, git_repo: "git.Repo", sha: str):
        self._git_repo = git_repo
        self._sha = sha
        # Per-instance cache for show(), keyed by whether the diff is included. A cache stored on
        # the instance (rather than functools.cache on the method) does not keep the object alive
        # for the lifetime of the program.
        self._show_cache = {}

    def __eq__(self, other):
        """
        Return whether two commits refer to the same commit.

        This doesn't take into account the content of the Git tree at those commits, only the
        'identity' of the commits themselves. Comparing against something that is not a Commit
        yields NotImplemented so that Python falls back to its default handling.
        """
        if not isinstance(other, Commit):
            return NotImplemented
        return self.fullrev == other.fullrev

    def __lt__(self, other):
        """
        Return whether a commit is a strict ancestor of another commit in the Git repository.

        Raises RuntimeError if Git is unable to determine the relationship between the commits.
        """
        if not isinstance(other, Commit):
            return NotImplemented

        # `git merge-base --is-ancestor A A` succeeds, but a commit must never be "less than"
        # itself, otherwise the ordering would be inconsistent with __eq__.
        if self == other:
            return False

        # Is self._sha an ancestor of other._sha?
        res = subprocess.run(['git', '-C', self._git_repo.git_dir, 'merge-base', '--is-ancestor', self._sha, other._sha])
        # Exit status 0 means "is an ancestor" and 1 means "is not"; anything else is an error.
        if res.returncode not in (0, 1):
            raise RuntimeError(f'Error when trying to obtain the commit order for {self._sha} and {other._sha}')
        return res.returncode == 0

    def __hash__(self):
        """
        Return a hash derived from the full revision of this commit, consistently with __eq__.
        """
        return hash(self.fullrev)

    def show(self, include_diff=False):
        """
        Return the commit information equivalent to `git show` associated to this commit.

        The diff is omitted unless `include_diff` is true. The output is cached per instance,
        separately for each value of `include_diff`.
        """
        key = bool(include_diff)
        if key not in self._show_cache:
            cmd = ['git', '-C', self._git_repo.git_dir, 'show', self._sha]
            if not include_diff:
                cmd.append('--no-patch')
            self._show_cache[key] = subprocess.check_output(cmd, text=True)
        return self._show_cache[key]

    @functools.cached_property
    def shortrev(self):
        """
        Return the shortened version of the given SHA.
        """
        return subprocess.check_output(['git', '-C', self._git_repo.git_dir, 'rev-parse', '--short', self._sha], text=True).strip()

    @functools.cached_property
    def fullrev(self):
        """
        Return the full SHA associated to this commit.
        """
        return subprocess.check_output(['git', '-C', self._git_repo.git_dir, 'rev-parse', self._sha], text=True).strip()

    @functools.cached_property
    def commit_date(self):
        """
        Return the date of the commit as a `datetime.datetime` object.

        The object is naive (no timezone attached) and expressed in the local timezone, since it
        is built with `datetime.fromtimestamp` without an explicit timezone.
        """
        return datetime.datetime.fromtimestamp(self._git_repo.commit(self._sha).committed_date)

    def prefetch(self):
        """
        Prefetch cached properties associated to this commit object.

        This makes it possible to control when time is spent recovering that information from Git for
        e.g. better reporting to the user.
        """
        self.commit_date
        self.fullrev
        self.shortrev
        self.show()

    def __str__(self):
        """
        Return the revision exactly as it was provided when creating this object.
        """
        return self._sha

def truncate_lines(string, n, marker=None):
    """
    Truncate the given string at a certain number of lines.

    Optionally, add a marker on the last line to identify that truncation has happened. The
    marker replaces the last retained line, so the result never has more than `n` lines.

    Lines are re-joined with a newline character and no trailing newline is added. Raises
    ValueError if `n` is negative.
    """
    if n < 0:
        raise ValueError(f'Cannot truncate to a negative number of lines: {n}')

    lines = string.splitlines()
    truncated = lines[:n]
    # When n == 0 nothing is retained, so there is no last line the marker could replace.
    if marker is not None and truncated and len(lines) > len(truncated):
        truncated[-1] = marker
    return '\n'.join(truncated)

def create_plot(data, metric, trendline=None, subtitle=None):
    """
    Build a scatter plot of the given metric for every benchmark across the commits in `data`.

    The rows are ordered by their position in the revision list, each benchmark gets its own
    color and symbol, and hovering a point shows the (truncated) `git show` output of its commit.
    The title names the first and last commit of the plotted range.
    """
    ordered = data.sort_values(by=['revlist_order', 'benchmark'])
    # Unique commits in order of appearance, i.e. following the 'revlist_order' sort above.
    commits = pandas.unique(ordered['commit'])

    # Compute the hover text once per commit rather than once per row.
    hover_text = {}
    for commit in commits:
        summary = truncate_lines(commit.show(), 30, marker='...')
        hover_text[commit] = summary.replace('\n', '<br>')

    title = f"{commits[0].shortrev} to {commits[-1].shortrev}"
    per_row_hover = [hover_text[commit] for commit in ordered['commit']]
    return plotly.express.scatter(ordered,
                                  title=title,
                                  subtitle=subtitle,
                                  x='revlist_order', y=metric,
                                  symbol='benchmark',
                                  color='benchmark',
                                  hover_name=per_row_hover,
                                  trendline=trendline)

def directory_path(string):
    """
    Convert a command-line argument into a `pathlib.Path` pointing to an existing directory.

    Raises NotADirectoryError if the argument does not name an existing directory.
    """
    if not os.path.isdir(string):
        raise NotADirectoryError(string)
    return pathlib.Path(string)

def parse_lnt(lines, aggregate=statistics.median):
    """
    Parse lines in LNT format and return a list of dictionaries of the form:

        [
            {
                'benchmark': <benchmark1>,
                <metric1>: float,
                <metric2>: float,
                ...
            },
            {
                'benchmark': <benchmark2>,
                <metric1>: float,
                <metric2>: float,
                ...
            },
            ...
        ]

    Each non-blank line is expected to look like `<benchmark>.<metric> <value>`. The value is
    the last whitespace-separated field and the metric is whatever follows the last '.' of the
    identifier, so benchmark names may themselves contain dots.

    If a metric has multiple values associated to it, they are aggregated into a single
    value using the provided aggregation function. Benchmarks and metrics are reported in
    the order in which they first appear.

    Raises ValueError if a line is malformed or if its value is not a valid float.
    """
    # benchmark name -> metric name -> every value seen for that metric
    samples = {}
    for line in lines:
        line = line.strip()
        if not line:
            continue

        # Split from the right on any run of whitespace so that the value is always the last field.
        fields = line.rsplit(None, 1)
        if len(fields) != 2:
            raise ValueError(f'Malformed LNT line, expected "<benchmark>.<metric> <value>": {line!r}')
        (identifier, value) = fields

        # Only the last dot separates the metric; earlier dots belong to the benchmark name.
        (benchmark, separator, metric) = identifier.rpartition('.')
        if not separator:
            raise ValueError(f'Malformed LNT identifier, expected "<benchmark>.<metric>": {identifier!r}')
        if metric == 'benchmark':
            # That key is reserved for the benchmark name in the resulting dictionaries.
            raise ValueError(f'Metric name "benchmark" is reserved: {line!r}')

        samples.setdefault(benchmark, {}).setdefault(metric, []).append(float(value))

    return [
        {'benchmark': benchmark} | {metric: aggregate(values) for (metric, values) in metrics.items()}
        for (benchmark, metrics) in samples.items()
    ]

def sorted_revlist(git_repo, commits):
    """
    Return the list of commits sorted by their chronological order (from oldest to newest) in the
    provided Git repository. Items earlier in the list are older than items later in the list.

    `git_repo` is the path of the repository and `commits` is an iterable of revisions. The order
    is the one reported by `git rev-list --no-walk`, reversed so that the oldest commit comes first.
    An empty iterable of commits results in an empty list.

    Raises subprocess.CalledProcessError if Git fails, e.g. when a revision is unknown.
    """
    commits = list(commits)
    if not commits:
        # `git rev-list` fails with a usage error when it isn't given any revision.
        return []

    revlist_cmd = ['git', '-C', git_repo, 'rev-list', '--no-walk'] + commits
    revlist = subprocess.check_output(revlist_cmd, text=True).strip().splitlines()
    return list(reversed(revlist))

def main(argv):
    """
    Entry point of the tool: parse the command-line arguments in `argv` (without the program
    name), load the LNT files found in the requested directory, order the results according to
    the Git history and write an interactive HTML chart.

    Raises RuntimeError when there is nothing to plot (no LNT file, no benchmark result, or a
    filter that excludes every benchmark).
    """
    parser = argparse.ArgumentParser(
        prog='visualize-historical',
        description='Visualize historical data in LNT format. This program generates a HTML file that embeds an '
                    'interactive plot with the provided data. The HTML file can then be opened in a browser to '
                    'visualize the data as a chart.',
        epilog='This script depends on the modules listed in `libcxx/utils/requirements.txt`.')
    parser.add_argument('directory', type=directory_path,
        help='Path to a valid directory containing benchmark data in LNT format, each file being named <commit>.lnt. '
             'This is also the format generated by the `benchmark-historical` utility.')
    parser.add_argument('--output', '-o', type=pathlib.Path, required=False,
        help='Optional path where to output the resulting HTML file. If it already exists, it is overwritten. '
             'Defaults to a temporary file which is opened automatically once generated, but not removed after '
             'creation.')
    parser.add_argument('--metric', type=str, default='execution_time',
        help='The metric to compare. LNT data may contain multiple metrics (e.g. code size, execution time, etc) -- '
             'this option allows selecting which metric is being visualized. The default is "execution_time".')
    parser.add_argument('--filter', type=str, required=False,
        help='An optional regular expression used to filter the benchmarks included in the chart. '
             'Only benchmarks whose names match the regular expression will be included. '
             'Since the chart is interactive, it generally makes most sense to include all the benchmarks '
             'and to then filter them in the browser, but in some cases producing a chart with a reduced '
             'number of data series is useful.')
    parser.add_argument('--subtitle', type=str, required=False,
        help='Optional subtitle for the chart. This can be used to help identify the contents of the chart.')
    parser.add_argument('--git-repo', type=directory_path, default=pathlib.Path(os.getcwd()),
        help='Path to the git repository to use for ordering commits in time. '
             'By default, the current working directory is used.')
    parser.add_argument('--open', action='store_true',
        help='Whether to automatically open the generated HTML file when finished. If no output file is provided, '
             'the resulting benchmark is opened automatically by default.')
    parser.add_argument('--trendline', type=str, required=False, default=None, choices=('ols', 'lowess', 'expanding'),
        help='Optional trendline to add on each series in the chart. See the documentation in '
             'https://plotly.com/python-api-reference/generated/plotly.express.trendline_functions.html '
             'details on each option.')
    args = parser.parse_args(argv)
    repo = git.Repo(args.git_repo)

    # Extract benchmark data from the directory. Each file is named after the commit it was
    # generated for.
    files = list(args.directory.glob('*.lnt'))
    if not files:
        raise RuntimeError(f'No .lnt files found in "{args.directory}" -- nothing to plot')

    results = {}
    for file in tqdm.tqdm(files, desc='Parsing LNT files'):
        rows = parse_lnt(file.read_text().splitlines())
        (sha, _) = os.path.splitext(os.path.basename(file))
        results[Commit(repo, sha)] = rows

    # Obtain commit information which is then cached throughout the program. Do this
    # eagerly so we can provide a progress bar.
    for commit in tqdm.tqdm(results.keys(), desc='Prefetching Git information'):
        commit.prefetch()

    # Create a dataframe from the raw data and add some columns to it:
    # - 'commit' represents the Commit object associated to the results in that row
    # - `revlist_order` represents the order of the commit within the Git repository.
    # - `date` represents the commit date
    revlist = sorted_revlist(args.git_repo, [c.fullrev for c in results])
    # Map each revision to its position once instead of searching the list for every row.
    revlist_order = {sha: position for (position, sha) in enumerate(revlist)}
    data = pandas.DataFrame([
        row | {'commit': commit, 'revlist_order': revlist_order[commit.fullrev], 'date': commit.commit_date}
        for (commit, rows) in results.items()
        for row in rows
    ])
    if data.empty:
        raise RuntimeError(f'No benchmark results found in "{args.directory}" -- nothing to plot')

    # Filter the benchmarks if needed.
    if args.filter is not None:
        pattern = re.compile(args.filter)
        keeplist = {b for b in data['benchmark'].unique() if pattern.search(b) is not None}
        data = data[data['benchmark'].isin(keeplist)]
        if data.empty:
            raise RuntimeError(f'Filter "{args.filter}" resulted in empty data set -- nothing to plot')

    # Plot the data for all the required benchmarks.
    figure = create_plot(data, args.metric, trendline=args.trendline, subtitle=args.subtitle)
    do_open = args.output is None or args.open
    if args.output is not None:
        output = args.output
    else:
        # Reserve a file that is explicitly kept around after being closed, instead of relying on
        # a temporary file object being destroyed (and its file removed) before the path is reused.
        with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as temporary:
            output = temporary.name
    plotly.io.write_html(figure, file=output, auto_open=do_open)

# Only run the tool when this file is executed as a script. The program name is stripped from
# the arguments, and the remaining command-line arguments are handed to main().
if __name__ == '__main__':
    main(sys.argv[1:])