
This will allow moving the IncludeCleaner library essentials to Clang and decoupling them from the majority of clangd. The patch itself just moves the code, it doesn't change existing functionality. Reviewed By: sammccall Differential Revision: https://reviews.llvm.org/D119130
185 lines
6.5 KiB
Python
185 lines
6.5 KiB
Python
#!/usr/bin/env python
|
|
#===- cppreference_parser.py - ------------------------------*- python -*--===#
|
|
#
|
|
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
# See https://llvm.org/LICENSE.txt for license information.
|
|
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
#
|
|
#===------------------------------------------------------------------------===#
|
|
|
|
from bs4 import BeautifulSoup, NavigableString
|
|
|
|
import collections
|
|
import multiprocessing
|
|
import os
|
|
import re
|
|
import signal
|
|
import sys
|
|
|
|
|
|
class Symbol:
|
|
|
|
def __init__(self, name, namespace, headers):
|
|
# unqualifed symbol name, e.g. "move"
|
|
self.name = name
|
|
# namespace of the symbol (with trailing "::"), e.g. "std::", "" (global scope)
|
|
# None for C symbols.
|
|
self.namespace = namespace
|
|
# a list of corresponding headers
|
|
self.headers = headers
|
|
|
|
|
|
def _HasClass(tag, *classes):
|
|
for c in tag.get('class', []):
|
|
if c in classes:
|
|
return True
|
|
return False
|
|
|
|
|
|
def _ParseSymbolPage(symbol_page_html, symbol_name):
|
|
"""Parse symbol page and retrieve the include header defined in this page.
|
|
The symbol page provides header for the symbol, specifically in
|
|
"Defined in header <header>" section. An example:
|
|
|
|
<tr class="t-dsc-header">
|
|
<td colspan="2"> <div>Defined in header <code><ratio></code> </div>
|
|
</td></tr>
|
|
|
|
Returns a list of headers.
|
|
"""
|
|
headers = set()
|
|
all_headers = set()
|
|
|
|
soup = BeautifulSoup(symbol_page_html, "html.parser")
|
|
# Rows in table are like:
|
|
# Defined in header <foo> .t-dsc-header
|
|
# Defined in header <bar> .t-dsc-header
|
|
# decl1 .t-dcl
|
|
# Defined in header <baz> .t-dsc-header
|
|
# decl2 .t-dcl
|
|
for table in soup.select('table.t-dcl-begin, table.t-dsc-begin'):
|
|
current_headers = []
|
|
was_decl = False
|
|
for row in table.select('tr'):
|
|
if _HasClass(row, 't-dcl', 't-dsc'):
|
|
was_decl = True
|
|
# Symbols are in the first cell.
|
|
found_symbols = row.find('td').stripped_strings
|
|
if not symbol_name in found_symbols:
|
|
continue
|
|
headers.update(current_headers)
|
|
elif _HasClass(row, 't-dsc-header'):
|
|
# If we saw a decl since the last header, this is a new block of headers
|
|
# for a new block of decls.
|
|
if was_decl:
|
|
current_headers = []
|
|
was_decl = False
|
|
# There are also .t-dsc-header for "defined in namespace".
|
|
if not "Defined in header " in row.text:
|
|
continue
|
|
# The interesting header content (e.g. <cstdlib>) is wrapped in <code>.
|
|
for header_code in row.find_all("code"):
|
|
current_headers.append(header_code.text)
|
|
all_headers.add(header_code.text)
|
|
# If the symbol was never named, consider all named headers.
|
|
return headers or all_headers
|
|
|
|
|
|
def _ParseIndexPage(index_page_html):
|
|
"""Parse index page.
|
|
The index page lists all std symbols and hrefs to their detailed pages
|
|
(which contain the defined header). An example:
|
|
|
|
<a href="abs.html" title="abs"><tt>abs()</tt></a> (int) <br>
|
|
<a href="acos.html" title="acos"><tt>acos()</tt></a> <br>
|
|
|
|
Returns a list of tuple (symbol_name, relative_path_to_symbol_page, variant).
|
|
"""
|
|
symbols = []
|
|
soup = BeautifulSoup(index_page_html, "html.parser")
|
|
for symbol_href in soup.select("a[title]"):
|
|
# Ignore annotated symbols like "acos<>() (std::complex)".
|
|
# These tend to be overloads, and we the primary is more useful.
|
|
# This accidentally accepts begin/end despite the (iterator) caption: the
|
|
# (since C++11) note is first. They are good symbols, so the bug is unfixed.
|
|
caption = symbol_href.next_sibling
|
|
variant = None
|
|
if isinstance(caption, NavigableString) and "(" in caption:
|
|
variant = caption.text.strip(" ()")
|
|
symbol_tt = symbol_href.find("tt")
|
|
if symbol_tt:
|
|
symbols.append((symbol_tt.text.rstrip("<>()"), # strip any trailing <>()
|
|
symbol_href["href"], variant))
|
|
return symbols
|
|
|
|
|
|
def _ReadSymbolPage(path, name):
|
|
with open(path) as f:
|
|
return _ParseSymbolPage(f.read(), name)
|
|
|
|
|
|
def _GetSymbols(pool, root_dir, index_page_name, namespace, variants_to_accept):
|
|
"""Get all symbols listed in the index page. All symbols should be in the
|
|
given namespace.
|
|
|
|
Returns a list of Symbols.
|
|
"""
|
|
|
|
# Workflow steps:
|
|
# 1. Parse index page which lists all symbols to get symbol
|
|
# name (unqualified name) and its href link to the symbol page which
|
|
# contains the defined header.
|
|
# 2. Parse the symbol page to get the defined header.
|
|
index_page_path = os.path.join(root_dir, index_page_name)
|
|
with open(index_page_path, "r") as f:
|
|
# Read each symbol page in parallel.
|
|
results = [] # (symbol_name, promise of [header...])
|
|
for symbol_name, symbol_page_path, variant in _ParseIndexPage(f.read()):
|
|
# Variant symbols (e.g. the std::locale version of isalpha) add ambiguity.
|
|
# FIXME: use these as a fallback rather than ignoring entirely.
|
|
variants_for_symbol = variants_to_accept.get(
|
|
(namespace or "") + symbol_name, ())
|
|
if variant and variant not in variants_for_symbol:
|
|
continue
|
|
path = os.path.join(root_dir, symbol_page_path)
|
|
results.append((symbol_name,
|
|
pool.apply_async(_ReadSymbolPage, (path, symbol_name))))
|
|
|
|
# Build map from symbol name to a set of headers.
|
|
symbol_headers = collections.defaultdict(set)
|
|
for symbol_name, lazy_headers in results:
|
|
symbol_headers[symbol_name].update(lazy_headers.get())
|
|
|
|
symbols = []
|
|
for name, headers in sorted(symbol_headers.items(), key=lambda t : t[0]):
|
|
symbols.append(Symbol(name, namespace, list(headers)))
|
|
return symbols
|
|
|
|
|
|
def GetSymbols(parse_pages):
|
|
"""Get all symbols by parsing the given pages.
|
|
|
|
Args:
|
|
parse_pages: a list of tuples (page_root_dir, index_page_name, namespace)
|
|
"""
|
|
# By default we prefer the non-variant versions, as they're more common. But
|
|
# there are some symbols, whose variant is more common. This list describes
|
|
# those symbols.
|
|
variants_to_accept = {
|
|
# std::remove<> has variant algorithm.
|
|
"std::remove": ("algorithm"),
|
|
}
|
|
symbols = []
|
|
# Run many workers to process individual symbol pages under the symbol index.
|
|
# Don't allow workers to capture Ctrl-C.
|
|
pool = multiprocessing.Pool(
|
|
initializer=lambda: signal.signal(signal.SIGINT, signal.SIG_IGN))
|
|
try:
|
|
for root_dir, page_name, namespace in parse_pages:
|
|
symbols.extend(_GetSymbols(pool, root_dir, page_name, namespace,
|
|
variants_to_accept))
|
|
finally:
|
|
pool.terminate()
|
|
pool.join()
|
|
return symbols
|