# ===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===----------------------------------------------------------------------===##
"""A linter that detects potential typos in FileCheck directive names.

Consider a broken test foo.cpp:

// RUN: clang -cc1 -ast-dump %s | FileCheck %s --check-prefix=NEW
// RUN: clang -cc1 -ast-dump %s -std=c++98 | FileCheck %s --check-prefix=OLD
auto x = 42;
// NEWW: auto is a c++11 extension
// ODL-NOT: auto is a c++11 extension

We first detect the locally valid FileCheck directive prefixes by parsing the
--check-prefix flags. Here we get {CHECK, NEW, OLD}, so our directive names
are {CHECK, NEW, OLD, CHECK-NOT, NEW-NOT, ...}.

Then we look for lines that look like directives. These are of the form
'FOO:', usually at the beginning of a line or a comment. If any of these are
a "near-miss" for a directive name, then we suspect this is a typo and
report it.

Usage: filecheck_lint path/to/test/file/1 ... path/to/test/file/n
"""

import itertools
import logging
import pathlib
import re
import sys
from typing import Generator, Sequence, Tuple

# Maximum (inclusive) edit distance for a token to be reported as a typo.
_distance_threshold = 3
_prefixes = {'CHECK'}
_suffixes = {'-DAG', '-COUNT', '-EMPTY', '-LABEL', '-NEXT', '-NOT', '-SAME'}
# 'NOTE' and 'TODO' are not directives, but are likely to be false positives
# if encountered and to generate noise as a result. We filter them out also to
# avoid this.
_lit_directives = {
    'RUN',
    'REQUIRES',
    'UNSUPPORTED',
    'XFAIL',
    'DEFINE',
    'REDEFINE',
}
# 'COM' and 'RUN' are default comment prefixes for FileCheck.
_comment_prefixes = {'COM', 'RUN'}
_ignore = _lit_directives.union(_comment_prefixes).union({'NOTE', 'TODO'})


def levenshtein(s1: str, s2: str) -> int:  # pylint: disable=g-doc-args
    """Computes the edit distance between two strings.

    Additions, deletions, and substitutions all count as a single operation.
    """
    if not s1:
        return len(s2)
    if not s2:
        return len(s1)

    # Classic single-row dynamic programming: `distances[j]` holds the edit
    # distance between s1[:i] and s2[:j] for the row currently being built.
    distances = range(len(s2) + 1)
    for i in range(len(s1)):
        new_distances = [i + 1]
        for j in range(len(s2)):
            cost = min(
                distances[j] + int(s1[i] != s2[j]),  # substitution (or match)
                distances[j + 1] + 1,  # deletion
                new_distances[-1] + 1,  # insertion
            )
            new_distances.append(cost)
        distances = new_distances
    return distances[-1]


class FileRange:
    """Stores the coordinates of a span on a single line within a file.

    Attributes:
      line: the line number
      start_column: the (inclusive) column where the span starts
      end_column: the (inclusive) column where the span ends
    """

    line: int
    start_column: int
    end_column: int

    def __init__(self, content: str, start_byte: int, end_byte: int):  # pylint: disable=g-doc-args
        """Derives a span's coordinates based on a string and start/end bytes.

        `start_byte` and `end_byte` are assumed to be on the same line.
        """
        content_before_span = content[:start_byte]
        self.line = content_before_span.count('\n') + 1
        # rfind returns -1 when there is no newline, which conveniently makes
        # columns on the first line 1-based as well.
        self.start_column = start_byte - content_before_span.rfind('\n')
        self.end_column = self.start_column + (end_byte - start_byte - 1)

    def __str__(self) -> str:
        return f'{self.line}:{self.start_column}-{self.end_column}'


class Diagnostic:
    """Stores information about one typo and a suggested fix.

    Attributes:
      filepath: the path to the file in which the typo was found
      filerange: the position at which the typo was found in the file
      typo: the typo
      fix: a suggested fix
    """

    filepath: pathlib.Path
    filerange: FileRange
    typo: str
    fix: str

    def __init__(
        self,
        filepath: pathlib.Path,
        filerange: FileRange,
        typo: str,
        fix: str,  # pylint: disable=redefined-outer-name
    ):
        self.filepath = filepath
        self.filerange = filerange
        self.typo = typo
        self.fix = fix

    def __str__(self) -> str:
        return f'{self.filepath}:' + str(self.filerange) + f': {self.summary()}'

    def summary(self) -> str:
        return (f'Found potentially misspelled directive "{self.typo}". '
                f'Did you mean "{self.fix}"?')


def find_potential_directives(
    content: str,
) -> Generator[Tuple[FileRange, str], None, None]:
    """Extracts all the potential FileCheck directives from a string.

    What constitutes a potential directive is loosely defined---we err on the
    side of capturing more strings than is necessary, rather than missing any.

    Args:
      content: the string in which to look for directives

    Yields:
      Tuples (p, d) where p is the span where the potential directive occurs
      within the string and d is the potential directive.
    """
    # A potential directive starts at the beginning of a line or after a
    # comment marker, may be preceded by non-word filler, and runs up to the
    # first colon.
    directive_pattern = re.compile(
        r'(?:^|//|;|#)[^\d\w\-_]*([\d\w\-_][\s\d\w\-_]*):', re.MULTILINE)
    for match in directive_pattern.finditer(content):
        potential_directive, span = match.group(1), match.span(1)
        yield FileRange(content, span[0], span[1]), potential_directive


# TODO(bchetioui): also parse comment prefixes to ignore.
def parse_custom_prefixes(content: str) -> Generator[str, None, None]:  # pylint: disable=g-doc-args
    """Parses custom prefixes defined in the string provided.

    For example, given the following file content:
      RUN: something | FileCheck %s -check-prefixes CHECK1,CHECK2
      RUN: something_else | FileCheck %s -check-prefix 'CHECK3'
    the custom prefixes are CHECK1, CHECK2, and CHECK3.
    """
    # A parameter is either single-quoted, double-quoted, or a bare token.
    param_re = r'|'.join([r"'[^']*'", r'"[^"]*"', r'[^\'"\s]+'])
    for m in re.finditer(
            r'-check-prefix(?:es)?(?:\s+|=)({})'.format(param_re), content):
        prefixes = m.group(1)
        # `startswith` accepts a tuple of alternatives; strip matching quotes.
        if prefixes.startswith(("'", '"')):
            prefixes = prefixes[1:-1]
        for prefix in prefixes.split(','):
            yield prefix


def find_directive_typos(
    content: str,
    filepath: pathlib.Path,
    threshold: int = _distance_threshold,
) -> Generator[Diagnostic, None, None]:
    """Detects potential typos in FileCheck directives.

    Args:
      content: the content of the file
      filepath: the path to the file to check for typos in directives
      threshold: the (inclusive) maximum edit distance between a potential
        directive and an actual directive, such that the potential directive
        is classified as a typo

    Yields:
      Diagnostics, in order from the top of the file.
    """
    all_prefixes = _prefixes.union(set(parse_custom_prefixes(content)))
    all_directives = ([
        f'{prefix}{suffix}'
        for prefix, suffix in itertools.product(all_prefixes, _suffixes)
    ] + list(_ignore) + list(all_prefixes))

    def find_best_match(typo):
        # Seed with a sentinel one past the threshold so that a token with no
        # nearby directive maps to itself and is later discarded.
        return min(
            [(threshold + 1, typo)] + [(levenshtein(typo, d), d)
                                       for d in all_directives
                                       if abs(len(d) - len(typo)) <= threshold],
            key=lambda tup: tup[0],
        )

    potential_directives = find_potential_directives(content)

    for filerange, potential_directive in potential_directives:
        # TODO(bchetioui): match count directives more finely. We skip
        # directives starting with 'CHECK-COUNT-' for the moment as they
        # require more complex logic to be handled correctly.
        if any(
                potential_directive.startswith(f'{prefix}-COUNT-')
                for prefix in all_prefixes):
            continue

        # Ignoring potential typos that will not be matched later due to a too
        # low threshold, in order to avoid potentially long computation times.
        if len(potential_directive) > max(map(len, all_directives)) + threshold:
            continue

        score, best_match = find_best_match(potential_directive)
        if score == 0:  # This is an actual directive, ignore.
            continue
        elif score <= threshold and best_match not in _ignore:
            yield Diagnostic(filepath, filerange, potential_directive,
                             best_match)


def main(argv: Sequence[str]) -> None:
    """Lints each file given on the command line, printing diagnostics."""
    if len(argv) < 2:
        print(f'Usage: {argv[0]} path/to/file/1 ... path/to/file/n')
        # Use sys.exit rather than the site-injected exit() builtin, which is
        # not guaranteed to exist in every interpreter configuration.
        sys.exit(1)

    for filepath in argv[1:]:
        logging.info('Checking %s', filepath)
        # Read explicitly as UTF-8 so results do not depend on the locale's
        # default encoding.
        with open(filepath, 'rt', encoding='utf-8') as f:
            content = f.read()
            for diagnostic in find_directive_typos(
                    content,
                    pathlib.Path(filepath),
                    threshold=_distance_threshold,
            ):
                print(diagnostic)


if __name__ == '__main__':
    main(sys.argv)