From 16be2c0555080cdc0f0588dc0b4416e8c28d3cf9 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 6 Jan 2026 22:19:44 -0800 Subject: [PATCH] [ELF] Add VersionNode lexer state for better version script parsing ... so that `local:*` will be lexed as three tokens (`local`, `:`, `*`) instead of a single one in a version node. This is used by both version scripts and dynamic lists. Fix #174363 In addition, clean up special code for space-separated `local :` and `global :`. This patch brings our lexer behavior closer to GNU ld. While GNU ld additionally rejects more characters like `~/+,=`, we don't implement this additional validation. Pull Request: https://github.com/llvm/llvm-project/pull/174530 --- lld/ELF/ScriptLexer.cpp | 59 +++++++++++++++------- lld/ELF/ScriptLexer.h | 2 + lld/ELF/ScriptParser.cpp | 6 ++- lld/test/ELF/dynamic-list-extern.s | 10 ++++ lld/test/ELF/linkerscript/version-script.s | 3 +- lld/test/ELF/version-script.s | 3 +- 6 files changed, 62 insertions(+), 21 deletions(-) diff --git a/lld/ELF/ScriptLexer.cpp b/lld/ELF/ScriptLexer.cpp index 3fa473882f3b..c16a70ba1ce8 100644 --- a/lld/ELF/ScriptLexer.cpp +++ b/lld/ELF/ScriptLexer.cpp @@ -124,37 +124,62 @@ void ScriptLexer::lex() { return; } - // Some operators form separate tokens. - if (s.starts_with("<<=") || s.starts_with(">>=")) { - curTok = s.substr(0, 3); - s = s.substr(3); - return; - } - if (s.size() > 1 && (s[1] == '=' && strchr("+-*/!&^|", s[0]))) { - curTok = s.substr(0, 2); - s = s.substr(2); - return; - } + // In Script and Expr states, recognize compound assignment operators. + auto recognizeAssign = [&]() -> bool { + if (s.starts_with("<<=") || s.starts_with(">>=")) { + curTok = s.substr(0, 3); + s = s.substr(3); + return true; + } + if (s.size() > 1 && (s[1] == '=' && strchr("+-*/!&^|", s[0]))) { + curTok = s.substr(0, 2); + s = s.substr(2); + return true; + } + return false; + }; // Unquoted token.
The non-expression token is more relaxed than tokens in // C-like languages, so that you can write "file-name.cpp" as one bare // token. size_t pos; + constexpr StringRef scriptAndVersionChars = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + "0123456789_.$/\\~=+[]*?-!^:"; + constexpr StringRef exprChars = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" + "0123456789_.$"; switch (lexState) { case State::Script: - pos = s.find_first_not_of( - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" - "0123456789_.$/\\~=+[]*?-!^:"); + if (recognizeAssign()) + return; + pos = s.find_first_not_of(scriptAndVersionChars); break; case State::Expr: - pos = s.find_first_not_of( - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" - "0123456789_.$"); + if (recognizeAssign()) + return; + pos = s.find_first_not_of(exprChars); if (pos == 0 && s.size() >= 2 && ((s[0] == s[1] && strchr("<>&|", s[0])) || is_contained({"==", "!=", "<=", ">=", "<<", ">>"}, s.substr(0, 2)))) pos = 2; break; + case State::VersionNode: + // Treat `:` as a token boundary unless it's part of a scope operator `::` + // (for extern "C++"). This behavior resembles GNU ld and allows proper + // tokenization of patterns like `local:*`. + pos = 0; + for (; pos != s.size(); ++pos) { + if (s[pos] == ':') { + if (pos + 1 != s.size() && s[pos + 1] == ':') { + ++pos; + continue; + } + } else if (scriptAndVersionChars.contains(s[pos])) + continue; + break; + } + break; } if (pos == 0) diff --git a/lld/ELF/ScriptLexer.h b/lld/ELF/ScriptLexer.h index be691022f538..ba49155b9dc8 100644 --- a/lld/ELF/ScriptLexer.h +++ b/lld/ELF/ScriptLexer.h @@ -44,6 +44,8 @@ protected: enum class State { Script, Expr, + // Used by version node and dynamic list parsing. 
+ VersionNode, }; struct Token { diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp index b61dc647401a..07f3f786d7ce 100644 --- a/lld/ELF/ScriptParser.cpp +++ b/lld/ELF/ScriptParser.cpp @@ -179,6 +179,7 @@ static ExprValue bitOr(LinkerScript &s, ExprValue a, ExprValue b) { } void ScriptParser::readDynamicList() { + SaveAndRestore saved(lexState, State::VersionNode); expect("{"); SmallVector locals; SmallVector globals; @@ -207,6 +208,7 @@ void ScriptParser::readVersionScript() { } void ScriptParser::readVersionScriptCommand() { + SaveAndRestore saved(lexState, State::VersionNode); if (consume("{")) { readAnonymousDeclaration(); return; @@ -1779,11 +1781,11 @@ ScriptParser::readSymbols() { SmallVector ext = readVersionExtern(); v->insert(v->end(), ext.begin(), ext.end()); } else { - if (tok == "local:" || (tok == "local" && consume(":"))) { + if (tok == "local" && consume(":")) { v = &locals; continue; } - if (tok == "global:" || (tok == "global" && consume(":"))) { + if (tok == "global" && consume(":")) { v = &globals; continue; } diff --git a/lld/test/ELF/dynamic-list-extern.s b/lld/test/ELF/dynamic-list-extern.s index bb06cebf5f52..04e183763305 100644 --- a/lld/test/ELF/dynamic-list-extern.s +++ b/lld/test/ELF/dynamic-list-extern.s @@ -2,6 +2,7 @@ # Test that we can parse multiple externs. 
+# RUN: rm -rf %t && mkdir %t && cd %t # RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o # RUN: echo '{ extern "C" { foo; }; extern "C++" { bar; }; };' > %t.list @@ -9,3 +10,12 @@ # RUN: echo '{ extern "C" { foo }; extern "C++" { bar }; };' > %t.list # RUN: ld.lld --dynamic-list %t.list %t.o -shared -o %t.so + +# RUN: echo '{ extern "C++" { std::foo; }; };' > %t.list +# RUN: ld.lld --dynamic-list %t.list %t.o -shared -o %t.so + +# RUN: echo '{ extern "C++" { std:foo; }; };' > a.list +# RUN: not ld.lld --dynamic-list a.list %t.o -shared 2>&1 | FileCheck %s --check-prefix=ERR-COLON +# RUN: echo '{ extern "C++" { std:::foo; }; };' > a.list +# RUN: not ld.lld --dynamic-list a.list %t.o -shared 2>&1 | FileCheck %s --check-prefix=ERR-COLON +# ERR-COLON: error: a.list:1: ; expected, but got : diff --git a/lld/test/ELF/linkerscript/version-script.s b/lld/test/ELF/linkerscript/version-script.s index 6b97fede00c3..22ea24364811 100644 --- a/lld/test/ELF/linkerscript/version-script.s +++ b/lld/test/ELF/linkerscript/version-script.s @@ -6,7 +6,8 @@ # RUN: llvm-readobj -V %t.so | FileCheck %s # RUN: echo "SECTIONS { .text : { bar = foo; *(.text) } }" > %t.script -# RUN: echo "VERSION { V { global: foo; bar; local: *; }; }" >> %t.script +## `:` in `local:*` is lexed as a separate token. +# RUN: echo "VERSION { V { global: foo; bar; local:*; }; }" >> %t.script # RUN: ld.lld -T %t.script -shared --no-undefined-version %t.o -o %t.so # RUN: llvm-readobj -V %t.so | FileCheck %s diff --git a/lld/test/ELF/version-script.s b/lld/test/ELF/version-script.s index 7fd3b3733625..1e8f70bd501a 100644 --- a/lld/test/ELF/version-script.s +++ b/lld/test/ELF/version-script.s @@ -8,8 +8,9 @@ # RUN: ld.lld --version-script %t.script -shared %t.o %t2.so -o %t.so --fatal-warnings # RUN: llvm-readelf --dyn-syms %t.so | FileCheck --check-prefix=DSO %s +## `:` in `local:*` is lexed as a separate token. 
# RUN: echo "# comment" > %t3.script -# RUN: echo "{ local: *; # comment" >> %t3.script +# RUN: echo "{ local:*; # comment" >> %t3.script # RUN: echo -n "}; # comment" >> %t3.script # RUN: ld.lld --version-script %t3.script -shared %t.o %t2.so -o %t3.so # RUN: llvm-readelf --dyn-syms %t3.so | FileCheck --check-prefix=DSO2 %s