llvm-project/clang-tools-extra/clang-tidy/misc/ConfusableIdentifierCheck.cpp
Richard Smith c56c349d39
[clang-tidy] Switch misc-confusable-identifiers check to a faster algorithm. (#130369)
Optimizations:

- Only build the skeleton for each identifier once, rather than once for
each declaration of that identifier.
- Only compute the contexts in which identifiers are declared for
identifiers that have the same skeleton as another identifier in the
translation unit.
- Only compare pairs of declarations that are declared in related
contexts, rather than comparing all pairs of declarations with the same
skeleton.

Also simplify by removing the caching of enclosing `DeclContext` sets,
because with the above changes we don't even compute the enclosing
`DeclContext` sets in common cases. Instead, we terminate the traversal
to enclosing `DeclContext`s immediately if we've already found another
declaration in that context with the same identifier. (This optimization
is not currently applied to the `forallBases` traversal, but could be
applied there too if needed.)

This also fixes two bugs that together caused the check to fail to find
some of the issues it was looking for:

- The old check skipped comparisons of declarations from different
contexts unless both declarations were type template parameters. This
caused the checker to not warn on some instances of the CVE it is
intended to detect.
- The old check skipped comparisons of declarations in all base classes
other than the first one found by the traversal. This appears to be an
oversight, incorrectly returning `false` rather than `true` from the
`forallBases` callback, which terminates traversal.

This also fixes an issue where the check would have false positives for
template parameters and function parameters in some cases, because those
parameters sometimes have a parent `DeclContext` that is the parent of
the parameterized entity, or sometimes is the translation unit. In
either case, this would cause warnings about declarations that are never
visible together in any scope.

This decreases the runtime of this check, especially in the common case
where there are few or no skeletons with two or more different
identifiers. Running this check over LLVM, clang, and clang-tidy, the
wall time for the check as reported by clang-tidy's internal profiler is
reduced from 5202.86s to 3900.90s.
2025-07-01 13:31:46 -07:00

265 lines
9.1 KiB
C++

//===--- ConfusableIdentifierCheck.cpp - clang-tidy -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "ConfusableIdentifierCheck.h"
#include "clang/ASTMatchers/ASTMatchers.h"
#include "clang/Lex/Preprocessor.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/Support/ConvertUTF.h"
namespace {
// Preprocessed version of
// https://www.unicode.org/Public/security/latest/confusables.txt
//
// This contains a sorted array of { UTF32 codepoint; UTF32 values[N];}
#include "Confusables.inc"
} // namespace
namespace clang::tidy::misc {
ConfusableIdentifierCheck::ConfusableIdentifierCheck(StringRef Name,
ClangTidyContext *Context)
: ClangTidyCheck(Name, Context) {}
ConfusableIdentifierCheck::~ConfusableIdentifierCheck() = default;
// Build a skeleton out of the Original identifier, inspired by the algorithm
// described in http://www.unicode.org/reports/tr39/#def-skeleton
//
// FIXME: TR39 mandates:
//
// For an input string X, define skeleton(X) to be the following transformation
// on the string:
//
// 1. Convert X to NFD format, as described in [UAX15].
// 2. Concatenate the prototypes for each character in X according to the
// specified data, producing a string of exemplar characters.
// 3. Reapply NFD.
//
// We're skipping 1. and 3. for the sake of simplicity, but this can lead to
// false positive.
static llvm::SmallString<64U> skeleton(StringRef Name) {
using namespace llvm;
SmallString<64U> Skeleton;
Skeleton.reserve(1U + Name.size());
const char *Curr = Name.data();
const char *End = Curr + Name.size();
while (Curr < End) {
const char *Prev = Curr;
UTF32 CodePoint = 0;
ConversionResult Result = convertUTF8Sequence(
reinterpret_cast<const UTF8 **>(&Curr),
reinterpret_cast<const UTF8 *>(End), &CodePoint, strictConversion);
if (Result != conversionOK) {
errs() << "Unicode conversion issue\n";
break;
}
StringRef Key(Prev, Curr - Prev);
auto *Where = llvm::lower_bound(ConfusableEntries, CodePoint,
[](decltype(ConfusableEntries[0]) X,
UTF32 Y) { return X.codepoint < Y; });
if (Where == std::end(ConfusableEntries) || CodePoint != Where->codepoint) {
Skeleton.append(Prev, Curr);
} else {
UTF8 Buffer[32];
UTF8 *BufferStart = std::begin(Buffer);
UTF8 *IBuffer = BufferStart;
const UTF32 *ValuesStart = std::begin(Where->values);
const UTF32 *ValuesEnd = llvm::find(Where->values, '\0');
if (ConvertUTF32toUTF8(&ValuesStart, ValuesEnd, &IBuffer,
std::end(Buffer),
strictConversion) != conversionOK) {
errs() << "Unicode conversion issue\n";
break;
}
Skeleton.append((char *)BufferStart, (char *)IBuffer);
}
}
return Skeleton;
}
namespace {
struct Entry {
const NamedDecl *ND;
const Decl *Parent;
bool FromDerivedClass;
};
} // namespace
// Map from a context to the declarations in that context with the current
// skeleton. At most one entry per distinct identifier is tracked. The
// context is usually a `DeclContext`, but can also be a template declaration
// that has no corresponding context, such as an alias template or variable
// template.
using DeclsWithinContextMap =
llvm::DenseMap<const Decl *, llvm::SmallVector<Entry, 1>>;
static bool addToContext(DeclsWithinContextMap &DeclsWithinContext,
const Decl *Context, Entry E) {
auto &Decls = DeclsWithinContext[Context];
if (!Decls.empty() &&
Decls.back().ND->getIdentifier() == E.ND->getIdentifier()) {
// Already have a declaration with this identifier in this context. Don't
// track another one. This means that if an outer name is confusable with an
// inner name, we'll only diagnose the outer name once, pointing at the
// first inner declaration with that name.
if (Decls.back().FromDerivedClass && !E.FromDerivedClass) {
// Prefer the declaration that's not from the derived class, because that
// conflicts with more declarations.
Decls.back() = E;
return true;
}
return false;
}
Decls.push_back(E);
return true;
}
static void addToEnclosingContexts(DeclsWithinContextMap &DeclsWithinContext,
const Decl *Parent, const NamedDecl *ND) {
const Decl *Outer = Parent;
while (Outer) {
if (const auto *NS = dyn_cast<NamespaceDecl>(Outer))
Outer = NS->getCanonicalDecl();
if (!addToContext(DeclsWithinContext, Outer, {ND, Parent, false}))
return;
if (const auto *RD = dyn_cast<CXXRecordDecl>(Outer)) {
RD = RD->getDefinition();
if (RD) {
RD->forallBases([&](const CXXRecordDecl *Base) {
addToContext(DeclsWithinContext, Base, {ND, Parent, true});
return true;
});
}
}
auto *OuterDC = Outer->getDeclContext();
if (!OuterDC)
break;
Outer = cast_or_null<Decl>(OuterDC->getNonTransparentContext());
}
}
void ConfusableIdentifierCheck::check(
const ast_matchers::MatchFinder::MatchResult &Result) {
const auto *ND = Result.Nodes.getNodeAs<NamedDecl>("nameddecl");
if (!ND)
return;
addDeclToCheck(ND,
cast<Decl>(ND->getDeclContext()->getNonTransparentContext()));
// Associate template parameters with this declaration of this template.
if (const auto *TD = dyn_cast<TemplateDecl>(ND)) {
for (const NamedDecl *Param : *TD->getTemplateParameters())
addDeclToCheck(Param, TD->getTemplatedDecl());
}
// Associate function parameters with this declaration of this function.
if (const auto *FD = dyn_cast<FunctionDecl>(ND)) {
for (const NamedDecl *Param : FD->parameters())
addDeclToCheck(Param, ND);
}
}
void ConfusableIdentifierCheck::addDeclToCheck(const NamedDecl *ND,
const Decl *Parent) {
if (!ND || !Parent)
return;
const IdentifierInfo *NDII = ND->getIdentifier();
if (!NDII)
return;
StringRef NDName = NDII->getName();
if (NDName.empty())
return;
NameToDecls[NDII].push_back({ND, Parent});
}
void ConfusableIdentifierCheck::onEndOfTranslationUnit() {
llvm::StringMap<llvm::SmallVector<const IdentifierInfo *, 1>> SkeletonToNames;
// Compute the skeleton for each identifier.
for (auto &[Ident, Decls] : NameToDecls) {
SkeletonToNames[skeleton(Ident->getName())].push_back(Ident);
}
// Visit each skeleton with more than one identifier.
for (auto &[Skel, Idents] : SkeletonToNames) {
if (Idents.size() < 2) {
continue;
}
// Find the declaration contexts that transitively contain each identifier.
DeclsWithinContextMap DeclsWithinContext;
for (const IdentifierInfo *II : Idents) {
for (auto [ND, Parent] : NameToDecls[II]) {
addToEnclosingContexts(DeclsWithinContext, Parent, ND);
}
}
// Check to see if any declaration is declared in a context that
// transitively contains another declaration with a different identifier but
// the same skeleton.
for (const IdentifierInfo *II : Idents) {
for (auto [OuterND, OuterParent] : NameToDecls[II]) {
for (Entry Inner : DeclsWithinContext[OuterParent]) {
// Don't complain if the identifiers are the same.
if (OuterND->getIdentifier() == Inner.ND->getIdentifier())
continue;
// Don't complain about a derived-class name shadowing a base class
// private member.
if (OuterND->getAccess() == AS_private && Inner.FromDerivedClass)
continue;
// If the declarations are in the same context, only diagnose the
// later one.
if (OuterParent == Inner.Parent &&
Inner.ND->getASTContext()
.getSourceManager()
.isBeforeInTranslationUnit(Inner.ND->getLocation(),
OuterND->getLocation()))
continue;
diag(Inner.ND->getLocation(), "%0 is confusable with %1")
<< Inner.ND << OuterND;
diag(OuterND->getLocation(), "other declaration found here",
DiagnosticIDs::Note);
}
}
}
}
NameToDecls.clear();
}
void ConfusableIdentifierCheck::registerMatchers(
ast_matchers::MatchFinder *Finder) {
// Parameter declarations sometimes use the translation unit or some outer
// enclosing context as their `DeclContext`, instead of their parent, so
// we handle them specially in `check`.
auto AnyParamDecl = ast_matchers::anyOf(
ast_matchers::parmVarDecl(), ast_matchers::templateTypeParmDecl(),
ast_matchers::nonTypeTemplateParmDecl(),
ast_matchers::templateTemplateParmDecl());
Finder->addMatcher(ast_matchers::namedDecl(ast_matchers::unless(AnyParamDecl))
.bind("nameddecl"),
this);
}
} // namespace clang::tidy::misc