[ELF] Add VersionNode lexer state for better version script parsing

... so that `local:*` will be lexed as three tokens (`local`, `:`, `*`)
instead of a single one in a version node. The new lexer state is used by
both version scripts and dynamic lists. Fixes #174363

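In addition, clean up the special-case code for `local:` and `global:`: since
`:` is now always lexed as a separate token in a version node, the joined-token
checks (`tok == "local:"`, `tok == "global:"`) are removed, and the path that
previously handled space-separated `local :` and `global :` covers both
spellings.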

This patch brings our lexer behavior closer to GNU ld. While GNU ld
additionally rejects more characters like `~/+,=`, we don't implement
this additional validation.

Pull Request: https://github.com/llvm/llvm-project/pull/174530
This commit is contained in:
Fangrui Song 2026-01-06 22:19:44 -08:00 committed by GitHub
parent 5a63367b15
commit 16be2c0555
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 62 additions and 21 deletions

View File

@ -124,37 +124,62 @@ void ScriptLexer::lex() {
return;
}
// Some operators form separate tokens.
if (s.starts_with("<<=") || s.starts_with(">>=")) {
curTok = s.substr(0, 3);
s = s.substr(3);
return;
}
if (s.size() > 1 && (s[1] == '=' && strchr("+-*/!&^|", s[0]))) {
curTok = s.substr(0, 2);
s = s.substr(2);
return;
}
// In Script and Expr states, recognize compound assignment operators.
auto recognizeAssign = [&]() -> bool {
if (s.starts_with("<<=") || s.starts_with(">>=")) {
curTok = s.substr(0, 3);
s = s.substr(3);
return true;
}
if (s.size() > 1 && (s[1] == '=' && strchr("+-*/!&^|", s[0]))) {
curTok = s.substr(0, 2);
s = s.substr(2);
return true;
}
return false;
};
// Unquoted token. The non-expression token is more relaxed than tokens in
// C-like languages, so that you can write "file-name.cpp" as one bare
// token.
size_t pos;
constexpr StringRef scriptAndVersionChars =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
"0123456789_.$/\\~=+[]*?-!^:";
constexpr StringRef exprChars =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
"0123456789_.$";
switch (lexState) {
case State::Script:
pos = s.find_first_not_of(
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
"0123456789_.$/\\~=+[]*?-!^:");
if (recognizeAssign())
return;
pos = s.find_first_not_of(scriptAndVersionChars);
break;
case State::Expr:
pos = s.find_first_not_of(
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
"0123456789_.$");
if (recognizeAssign())
return;
pos = s.find_first_not_of(exprChars);
if (pos == 0 && s.size() >= 2 &&
((s[0] == s[1] && strchr("<>&|", s[0])) ||
is_contained({"==", "!=", "<=", ">=", "<<", ">>"}, s.substr(0, 2))))
pos = 2;
break;
case State::VersionNode:
// Treat `:` as a token boundary unless it's part of a scope operator `::`
// (for extern "C++"). This behavior resembles GNU ld and allows proper
// tokenization of patterns like `local:*`.
pos = 0;
for (; pos != s.size(); ++pos) {
if (s[pos] == ':') {
if (pos + 1 != s.size() && s[pos + 1] == ':') {
++pos;
continue;
}
} else if (scriptAndVersionChars.contains(s[pos]))
continue;
break;
}
break;
}
if (pos == 0)

View File

@ -44,6 +44,8 @@ protected:
enum class State {
Script,
Expr,
// Used by version node and dynamic list parsing.
VersionNode,
};
struct Token {

View File

@ -179,6 +179,7 @@ static ExprValue bitOr(LinkerScript &s, ExprValue a, ExprValue b) {
}
void ScriptParser::readDynamicList() {
SaveAndRestore saved(lexState, State::VersionNode);
expect("{");
SmallVector<SymbolVersion, 0> locals;
SmallVector<SymbolVersion, 0> globals;
@ -207,6 +208,7 @@ void ScriptParser::readVersionScript() {
}
void ScriptParser::readVersionScriptCommand() {
SaveAndRestore saved(lexState, State::VersionNode);
if (consume("{")) {
readAnonymousDeclaration();
return;
@ -1779,11 +1781,11 @@ ScriptParser::readSymbols() {
SmallVector<SymbolVersion, 0> ext = readVersionExtern();
v->insert(v->end(), ext.begin(), ext.end());
} else {
if (tok == "local:" || (tok == "local" && consume(":"))) {
if (tok == "local" && consume(":")) {
v = &locals;
continue;
}
if (tok == "global:" || (tok == "global" && consume(":"))) {
if (tok == "global" && consume(":")) {
v = &globals;
continue;
}

View File

@ -2,6 +2,7 @@
# Test that we can parse multiple externs.
# RUN: rm -rf %t && mkdir %t && cd %t
# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o
# RUN: echo '{ extern "C" { foo; }; extern "C++" { bar; }; };' > %t.list
@ -9,3 +10,12 @@
# RUN: echo '{ extern "C" { foo }; extern "C++" { bar }; };' > %t.list
# RUN: ld.lld --dynamic-list %t.list %t.o -shared -o %t.so
# RUN: echo '{ extern "C++" { std::foo; }; };' > %t.list
# RUN: ld.lld --dynamic-list %t.list %t.o -shared -o %t.so
# RUN: echo '{ extern "C++" { std:foo; }; };' > a.list
# RUN: not ld.lld --dynamic-list a.list %t.o -shared 2>&1 | FileCheck %s --check-prefix=ERR-COLON
# RUN: echo '{ extern "C++" { std:::foo; }; };' > a.list
# RUN: not ld.lld --dynamic-list a.list %t.o -shared 2>&1 | FileCheck %s --check-prefix=ERR-COLON
# ERR-COLON: error: a.list:1: ; expected, but got :

View File

@ -6,7 +6,8 @@
# RUN: llvm-readobj -V %t.so | FileCheck %s
# RUN: echo "SECTIONS { .text : { bar = foo; *(.text) } }" > %t.script
# RUN: echo "VERSION { V { global: foo; bar; local: *; }; }" >> %t.script
## `:` in `local:*` is lexed as a separate token.
# RUN: echo "VERSION { V { global: foo; bar; local:*; }; }" >> %t.script
# RUN: ld.lld -T %t.script -shared --no-undefined-version %t.o -o %t.so
# RUN: llvm-readobj -V %t.so | FileCheck %s

View File

@ -8,8 +8,9 @@
# RUN: ld.lld --version-script %t.script -shared %t.o %t2.so -o %t.so --fatal-warnings
# RUN: llvm-readelf --dyn-syms %t.so | FileCheck --check-prefix=DSO %s
## `:` in `local:*` is lexed as a separate token.
# RUN: echo "# comment" > %t3.script
# RUN: echo "{ local: *; # comment" >> %t3.script
# RUN: echo "{ local:*; # comment" >> %t3.script
# RUN: echo -n "}; # comment" >> %t3.script
# RUN: ld.lld --version-script %t3.script -shared %t.o %t2.so -o %t3.so
# RUN: llvm-readelf --dyn-syms %t3.so | FileCheck --check-prefix=DSO2 %s