This change can be seen as code cleanup, but the motivation is more performance-related.
While browsing perf reports captured during a Linux build, we can notice an unusual portion of instructions executed in the std::vector<std::string> copy constructor, like:
0.59% 0.58% clang-14 clang-14 [.] std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >,
std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >::vector
or even:
1.42% 0.26% clang clang-14 [.] clang::LangOptions::LangOptions
|
--1.16%--clang::LangOptions::LangOptions
|
--0.74%--std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >,
std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >::vector
After more digging, we can see that the relevant LangOptions std::vector members (*Files, ModuleFeatures and NoBuiltinFuncs)
are copy-constructed when the Lexer::LangOpts field is initialized in the member initializer list:
Lexer::Lexer(..., const LangOptions &langOpts, ...)
: ..., LangOpts(langOpts),
Since the LangOptions copy constructor is called by Lexer(..., const LangOptions &LangOpts, ...) and local Lexer objects are created thousands of times
(in Lexer::getRawToken, Preprocessor::EnterSourceFile and more) while a single module is processed in the frontend, the std::vector copy constructors become surprisingly hot.
Unfortunately, even though in the current Lexer implementation the mentioned std::vector members are unused and empty most of the time,
no compiler is smart enough to optimize their std::vector copy constructors out, even with LTO enabled (take a look at the test assembly): https://godbolt.org/z/hdoxPfMYY
However, there is a simple way to fix this. Since Lexer doesn't access *Files, ModuleFeatures, NoBuiltinFuncs or any other LangOptions fields (only those of LangOptionsBase),
we can simply get rid of the redundant copy constructor assembly by changing the type of LangOpts to the more appropriate const LangOptions reference: https://godbolt.org/z/fP7de9176
Additionally, we need to store LineComment outside LangOpts because it is written in the SkipLineComment function.
Also, FormatTokenLexer needs to be adjusted a bit to avoid lifetime issues related to passing a reference to a local LangOpts to Lexer.
After this change I can see more than a 1% speedup in some of my microbenchmarks when using a Clang release binary built with LTO.
For the Linux build the gains are not as significant, but still nice: a drop of about 0.4%-0.5% in executed instructions.
Differential Revision: https://reviews.llvm.org/D120334
136 lines
4.0 KiB
C++
136 lines
4.0 KiB
C++
//===--- FormatTokenLexer.h - Format C++ code ----------------*- C++ ----*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains FormatTokenLexer, which tokenizes a source file
/// into a token stream suitable for ClangFormat.
///
//===----------------------------------------------------------------------===//
|
|
|
|
#ifndef LLVM_CLANG_LIB_FORMAT_FORMATTOKENLEXER_H
|
|
#define LLVM_CLANG_LIB_FORMAT_FORMATTOKENLEXER_H
|
|
|
|
#include "Encoding.h"
|
|
#include "FormatToken.h"
|
|
#include "clang/Basic/LangOptions.h"
|
|
#include "clang/Basic/SourceLocation.h"
|
|
#include "clang/Basic/SourceManager.h"
|
|
#include "clang/Format/Format.h"
|
|
#include "llvm/ADT/MapVector.h"
|
|
#include "llvm/ADT/StringSet.h"
|
|
#include "llvm/Support/Regex.h"
|
|
|
|
#include <stack>
|
|
|
|
namespace clang {
|
|
namespace format {
|
|
|
|
/// Lexing modes of FormatTokenLexer; the lexer keeps a stack of these values.
///
/// The enumerators are unscoped and keep their implicit values (0, 1, 2) in
/// declaration order.
enum LexerState {
  // Default mode.
  NORMAL,
  // NOTE(review): presumably the mode used while inside a JavaScript template
  // string -- confirm against the implementation file.
  TEMPLATE_STRING,
  // NOTE(review): presumably signals that a token has been set aside and will
  // be handed out by the next token request -- confirm against the
  // implementation file.
  TOKEN_STASHED,
};
|
|
|
|
class FormatTokenLexer {
|
|
public:
|
|
FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, unsigned Column,
|
|
const FormatStyle &Style, encoding::Encoding Encoding,
|
|
llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
|
|
IdentifierTable &IdentTable);
|
|
|
|
ArrayRef<FormatToken *> lex();
|
|
|
|
const AdditionalKeywords &getKeywords() { return Keywords; }
|
|
|
|
private:
|
|
void tryMergePreviousTokens();
|
|
|
|
bool tryMergeLessLess();
|
|
bool tryMergeNSStringLiteral();
|
|
bool tryMergeJSPrivateIdentifier();
|
|
bool tryMergeCSharpStringLiteral();
|
|
bool tryMergeCSharpKeywordVariables();
|
|
bool tryMergeNullishCoalescingEqual();
|
|
bool tryTransformCSharpForEach();
|
|
bool tryMergeForEach();
|
|
bool tryTransformTryUsageForC();
|
|
|
|
bool tryMergeTokens(ArrayRef<tok::TokenKind> Kinds, TokenType NewType);
|
|
|
|
// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
|
|
bool precedesOperand(FormatToken *Tok);
|
|
|
|
bool canPrecedeRegexLiteral(FormatToken *Prev);
|
|
|
|
// Tries to parse a JavaScript Regex literal starting at the current token,
|
|
// if that begins with a slash and is in a location where JavaScript allows
|
|
// regex literals. Changes the current token to a regex literal and updates
|
|
// its text if successful.
|
|
void tryParseJSRegexLiteral();
|
|
|
|
// Handles JavaScript template strings.
|
|
//
|
|
// JavaScript template strings use backticks ('`') as delimiters, and allow
|
|
// embedding expressions nested in ${expr-here}. Template strings can be
|
|
// nested recursively, i.e. expressions can contain template strings in turn.
|
|
//
|
|
// The code below parses starting from a backtick, up to a closing backtick or
|
|
// an opening ${. It also maintains a stack of lexing contexts to handle
|
|
// nested template parts by balancing curly braces.
|
|
void handleTemplateStrings();
|
|
|
|
void handleCSharpVerbatimAndInterpolatedStrings();
|
|
|
|
void tryParsePythonComment();
|
|
|
|
bool tryMerge_TMacro();
|
|
|
|
bool tryMergeConflictMarkers();
|
|
|
|
FormatToken *getStashedToken();
|
|
|
|
FormatToken *getNextToken();
|
|
|
|
FormatToken *FormatTok;
|
|
bool IsFirstToken;
|
|
std::stack<LexerState> StateStack;
|
|
unsigned Column;
|
|
unsigned TrailingWhitespace;
|
|
std::unique_ptr<Lexer> Lex;
|
|
LangOptions LangOpts;
|
|
const SourceManager &SourceMgr;
|
|
FileID ID;
|
|
const FormatStyle &Style;
|
|
IdentifierTable &IdentTable;
|
|
AdditionalKeywords Keywords;
|
|
encoding::Encoding Encoding;
|
|
llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator;
|
|
// Index (in 'Tokens') of the last token that starts a new line.
|
|
unsigned FirstInLineIndex;
|
|
SmallVector<FormatToken *, 16> Tokens;
|
|
|
|
llvm::SmallMapVector<IdentifierInfo *, TokenType, 8> Macros;
|
|
|
|
bool FormattingDisabled;
|
|
|
|
llvm::Regex MacroBlockBeginRegex;
|
|
llvm::Regex MacroBlockEndRegex;
|
|
|
|
// Targets that may appear inside a C# attribute.
|
|
static const llvm::StringSet<> CSharpAttributeTargets;
|
|
|
|
void readRawToken(FormatToken &Tok);
|
|
|
|
void resetLexer(unsigned Offset);
|
|
};
|
|
|
|
} // namespace format
|
|
} // namespace clang
|
|
|
|
#endif
|