From 636f34ea5614c5f6834580ee68a50163e542459f Mon Sep 17 00:00:00 2001 From: Thomas Symalla Date: Sun, 18 Jan 2026 21:18:39 +0100 Subject: [PATCH] [regex][FileCheck] Support back-references up to 20. (#174150) Support `\g{n}`-style back-references in `regcomp` as well by increasing the limit from 9 to 20 and adding additional parsing. Update the limit checks in FileCheck. The limit can theoretically be removed by reallocating the regex matcher's internal arrays, but I don't see a use case for that as of now. Update a test that now should pass when using more than 9 back-references. Add a new test that tests for the error message explicitly. --- llvm/lib/FileCheck/FileCheck.cpp | 17 ++++++--- llvm/lib/Support/Regex.cpp | 2 +- llvm/lib/Support/regcomp.c | 52 ++++++++++++++++++--------- llvm/test/FileCheck/backref-limit.txt | 7 ++++ llvm/test/FileCheck/capture-limit.txt | 7 ++-- llvm/unittests/Support/RegexTest.cpp | 19 ++++++++++ 6 files changed, 78 insertions(+), 26 deletions(-) create mode 100644 llvm/test/FileCheck/backref-limit.txt diff --git a/llvm/lib/FileCheck/FileCheck.cpp b/llvm/lib/FileCheck/FileCheck.cpp index bd43179704fa..d50e5d9cb088 100644 --- a/llvm/lib/FileCheck/FileCheck.cpp +++ b/llvm/lib/FileCheck/FileCheck.cpp @@ -28,6 +28,8 @@ using namespace llvm; +constexpr static int BackrefLimit = 20; + StringRef ExpressionFormat::toString() const { switch (Value) { case Kind::NoFormat: @@ -1054,10 +1056,11 @@ bool Pattern::parsePattern(StringRef PatternStr, StringRef Prefix, if (!IsNumBlock && (It = VariableDefs.find(SubstStr)) != VariableDefs.end()) { unsigned CaptureParenGroup = It->second; - if (CaptureParenGroup < 1 || CaptureParenGroup > 9) { + if (CaptureParenGroup < 1 || CaptureParenGroup > BackrefLimit) { SM.PrintMessage(SMLoc::getFromPointer(SubstStr.data()), SourceMgr::DK_Error, - "Can't back-reference more than 9 variables"); + "Can't back-reference more than " + + Twine(BackrefLimit) + " variables"); return true; } 
AddBackrefToRegEx(CaptureParenGroup); @@ -1108,8 +1111,14 @@ bool Pattern::AddRegExToRegEx(StringRef RS, unsigned &CurParen, SourceMgr &SM) { } void Pattern::AddBackrefToRegEx(unsigned BackrefNum) { - assert(BackrefNum >= 1 && BackrefNum <= 9 && "Invalid backref number"); - std::string Backref = std::string("\\") + std::string(1, '0' + BackrefNum); + assert(BackrefNum >= 1 && BackrefNum <= BackrefLimit && + "Invalid backref number"); + std::string Backref; + if (BackrefNum >= 1 && BackrefNum <= 9) + Backref = std::string("\\") + std::string(1, '0' + BackrefNum); + else + Backref = std::string("\\g{") + std::to_string(BackrefNum) + '}'; + RegExStr += Backref; } diff --git a/llvm/lib/Support/Regex.cpp b/llvm/lib/Support/Regex.cpp index 5eedf95c48e3..5a96f1974341 100644 --- a/llvm/lib/Support/Regex.cpp +++ b/llvm/lib/Support/Regex.cpp @@ -154,7 +154,7 @@ std::string Regex::sub(StringRef Repl, StringRef String, // Add the skipped substring. Res += Split.first; - // Check for terminimation and trailing backslash. + // Check for termination and trailing backslash. if (Split.second.empty()) { if (Repl.size() != Split.first.size() && Error && Error->empty()) diff --git a/llvm/lib/Support/regcomp.c b/llvm/lib/Support/regcomp.c index f5c47781f3d8..6838b96a0e5d 100644 --- a/llvm/lib/Support/regcomp.c +++ b/llvm/lib/Support/regcomp.c @@ -192,7 +192,7 @@ struct parse { sopno slen; /* malloced strip length (used) */ int ncsalloc; /* number of csets allocated */ struct re_guts *g; -#define NPAREN 10 /* we need to remember () 1-9 for back refs */ +#define NPAREN 21 /* we need to remember () 1-20 for back refs */ sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ sopno pend[NPAREN]; /* -> ) ([0] unused) */ }; @@ -506,27 +506,47 @@ static void p_ere_exp(struct parse *p) { * least 4 matching groups specified in the pattern previously). 
*/ backrefnum = c - '0'; - if (p->pend[backrefnum] == 0) { - SETERROR(REG_ESUBREG); - break; - } - - /* Make sure everything checks out and emit the sequence - * that marks a back-reference to the parse structure. + } else if (c == 'g') { + /* Support back-references with index greater 9. + * These look like that: \g{n} + * Extract the number inside the brackets. */ - assert(backrefnum <= p->g->nsub); - EMIT(OBACK_, backrefnum); - assert(p->pbegin[backrefnum] != 0); - assert(OP(p->strip[p->pbegin[backrefnum]]) == OLPAREN); - assert(OP(p->strip[p->pend[backrefnum]]) == ORPAREN); - (void)dupl(p, p->pbegin[backrefnum] + 1, p->pend[backrefnum]); - EMIT(O_BACK, backrefnum); - p->g->backrefs = 1; + MUSTEAT('{', REG_BADRPT); + + backrefnum = 0; + while (MORE() && isdigit(PEEK())) { + c = GETNEXT(); + backrefnum = backrefnum * 10 + c - '0'; + } + MUSTEAT('}', REG_BADRPT); } else { /* Other chars are simply themselves when escaped with a backslash. */ ordinary(p, c); + break; } + + if (backrefnum >= NPAREN) { + SETERROR(REG_ESUBREG); + break; + } + + if (p->pend[backrefnum] == 0) { + SETERROR(REG_ESUBREG); + break; + } + + /* Make sure everything checks out and emit the sequence + * that marks a back-reference to the parse structure. 
+ */ + assert(backrefnum <= p->g->nsub); + EMIT(OBACK_, backrefnum); + assert(p->pbegin[backrefnum] != 0); + assert(OP(p->strip[p->pbegin[backrefnum]]) == OLPAREN); + assert(OP(p->strip[p->pend[backrefnum]]) == ORPAREN); + (void)dupl(p, p->pbegin[backrefnum] + 1, p->pend[backrefnum]); + EMIT(O_BACK, backrefnum); + p->g->backrefs = 1; break; case '{': /* okay as ordinary except if digit follows */ REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); diff --git a/llvm/test/FileCheck/backref-limit.txt b/llvm/test/FileCheck/backref-limit.txt new file mode 100644 index 000000000000..890630e6e471 --- /dev/null +++ b/llvm/test/FileCheck/backref-limit.txt @@ -0,0 +1,7 @@ +; RUN: not FileCheck -check-prefix=CHECK-BACKREF %s < /dev/null 2>&1 | FileCheck -check-prefix=ERR-CHECK-BACKREF %s + +; ERR-CHECK-BACKREF: error: Can't back-reference more than 20 variables + +r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r13 r14 r15 r16 r17 r18 r19 r20 r21 + +; CHECK-BACKREF: [[REG1:r0]] [[REG2:r1]] [[REG3:r2]] [[REG4:r3]] [[REG5:r4]] [[REG6:r5]] [[REG7:r6]] [[REG8:r7]] [[REG9:r8]] [[REG10:r9]] [[REG11:r10]] [[REG12:r11]] [[REG13:r12]] [[REG14:r13]] [[REG15:r14]] [[REG16:r15]] [[REG17:r16]] [[REG18:r17]] [[REG19:r18]] [[REG20:r19]] [[REG21:r20]] [[REG21]] diff --git a/llvm/test/FileCheck/capture-limit.txt b/llvm/test/FileCheck/capture-limit.txt index a727be0c781f..7ccb122b71c3 100644 --- a/llvm/test/FileCheck/capture-limit.txt +++ b/llvm/test/FileCheck/capture-limit.txt @@ -1,8 +1,5 @@ ; RUN: FileCheck -input-file %s %s -; XFAIL: * -; Trying to back-reference more than 9 variables is intended to fail. 
+r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r9 r8 -r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r9 - -; CHECK: [[REG1:r0]] [[REG2:r1]] [[REG3:r2]] [[REG4:r3]] [[REG5:r4]] [[REG6:r5]] [[REG7:r6]] [[REG8:r7]] [[REG9:r8]] [[REG10:r9]] [[REG10]] +; CHECK: [[REG1:r0]] [[REG2:r1]] [[REG3:r2]] [[REG4:r3]] [[REG5:r4]] [[REG6:r5]] [[REG7:r6]] [[REG8:r7]] [[REG9:r8]] [[REG10:r9]] [[REG10]] [[REG9]] diff --git a/llvm/unittests/Support/RegexTest.cpp b/llvm/unittests/Support/RegexTest.cpp index c6ac42591d1f..18a78b731f3e 100644 --- a/llvm/unittests/Support/RegexTest.cpp +++ b/llvm/unittests/Support/RegexTest.cpp @@ -106,6 +106,25 @@ TEST_F(RegexTest, Backreferences) { EXPECT_EQ(2u, Matches.size()); EXPECT_FALSE(r6.match("abc_ab", &Matches)); EXPECT_FALSE(r6.match("abc_xyz", &Matches)); + + Matches.clear(); + Regex r7("(a)|(b)|(c)|(d)|(e)|(f)|(g)|(h)|(i)|(j)_\\g{10}"); + EXPECT_TRUE(r7.match("j_j", &Matches)); + EXPECT_FALSE(r7.match("k_k", &Matches)); + EXPECT_FALSE(r7.match("j_k", &Matches)); + EXPECT_EQ(11u, Matches.size()); + + std::string Error; + + Matches.clear(); + Regex r8("(a|b|c|d|e|f|g|h|i|j|k|l|m|n)_\\g{21}"); + EXPECT_FALSE(r8.match("j_j", &Matches, &Error)); + EXPECT_EQ(Error, "invalid backreference number"); + + Matches.clear(); + Regex r9("(a|b|c|d|e|f|g|h|i|j|k|l|m|n)_\\g{20}"); + r9.match("n_n", &Matches); + EXPECT_EQ(0u, Matches.size()); } TEST_F(RegexTest, Substitution) {