[regex][FileCheck] Support back-references up to 20. (#174150)

Support `\g{n}`-style back references in `regcomp` as well by increasing
the limit from 9 to 20 and adding additional parsing. Update the limit
checks in FileCheck. The limit could theoretically be removed by
reallocating the regex matcher's internal arrays, but I don't see a use
case for that as of now.

Update a test that now should pass when using more than 9
back-references.

Add a new test that explicitly checks for the error message.
This commit is contained in:
Thomas Symalla 2026-01-18 21:18:39 +01:00 committed by GitHub
parent 497a6d6722
commit 636f34ea56
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 78 additions and 26 deletions

View File

@ -28,6 +28,8 @@
using namespace llvm;
constexpr static int BackrefLimit = 20;
StringRef ExpressionFormat::toString() const {
switch (Value) {
case Kind::NoFormat:
@ -1054,10 +1056,11 @@ bool Pattern::parsePattern(StringRef PatternStr, StringRef Prefix,
if (!IsNumBlock &&
(It = VariableDefs.find(SubstStr)) != VariableDefs.end()) {
unsigned CaptureParenGroup = It->second;
if (CaptureParenGroup < 1 || CaptureParenGroup > 9) {
if (CaptureParenGroup < 1 || CaptureParenGroup > BackrefLimit) {
SM.PrintMessage(SMLoc::getFromPointer(SubstStr.data()),
SourceMgr::DK_Error,
"Can't back-reference more than 9 variables");
"Can't back-reference more than " +
Twine(BackrefLimit) + " variables");
return true;
}
AddBackrefToRegEx(CaptureParenGroup);
@ -1108,8 +1111,14 @@ bool Pattern::AddRegExToRegEx(StringRef RS, unsigned &CurParen, SourceMgr &SM) {
}
void Pattern::AddBackrefToRegEx(unsigned BackrefNum) {
assert(BackrefNum >= 1 && BackrefNum <= 9 && "Invalid backref number");
std::string Backref = std::string("\\") + std::string(1, '0' + BackrefNum);
assert(BackrefNum >= 1 && BackrefNum <= BackrefLimit &&
"Invalid backref number");
std::string Backref;
if (BackrefNum >= 1 && BackrefNum <= 9)
Backref = std::string("\\") + std::string(1, '0' + BackrefNum);
else
Backref = std::string("\\g{") + std::to_string(BackrefNum) + '}';
RegExStr += Backref;
}

View File

@ -154,7 +154,7 @@ std::string Regex::sub(StringRef Repl, StringRef String,
// Add the skipped substring.
Res += Split.first;
// Check for terminimation and trailing backslash.
// Check for termination and trailing backslash.
if (Split.second.empty()) {
if (Repl.size() != Split.first.size() &&
Error && Error->empty())

View File

@ -192,7 +192,7 @@ struct parse {
sopno slen; /* malloced strip length (used) */
int ncsalloc; /* number of csets allocated */
struct re_guts *g;
#define NPAREN 10 /* we need to remember () 1-9 for back refs */
#define NPAREN 21 /* we need to remember () 1-20 for back refs */
sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
sopno pend[NPAREN]; /* -> ) ([0] unused) */
};
@ -506,27 +506,47 @@ static void p_ere_exp(struct parse *p) {
* least 4 matching groups specified in the pattern previously).
*/
backrefnum = c - '0';
if (p->pend[backrefnum] == 0) {
SETERROR(REG_ESUBREG);
break;
}
/* Make sure everything checks out and emit the sequence
* that marks a back-reference to the parse structure.
} else if (c == 'g') {
/* Support back-references with index greater 9.
* These look like that: \g{n}
* Extract the number inside the brackets.
*/
assert(backrefnum <= p->g->nsub);
EMIT(OBACK_, backrefnum);
assert(p->pbegin[backrefnum] != 0);
assert(OP(p->strip[p->pbegin[backrefnum]]) == OLPAREN);
assert(OP(p->strip[p->pend[backrefnum]]) == ORPAREN);
(void)dupl(p, p->pbegin[backrefnum] + 1, p->pend[backrefnum]);
EMIT(O_BACK, backrefnum);
p->g->backrefs = 1;
MUSTEAT('{', REG_BADRPT);
backrefnum = 0;
while (MORE() && isdigit(PEEK())) {
c = GETNEXT();
backrefnum = backrefnum * 10 + c - '0';
}
MUSTEAT('}', REG_BADRPT);
} else {
/* Other chars are simply themselves when escaped with a backslash.
*/
ordinary(p, c);
break;
}
if (backrefnum >= NPAREN) {
SETERROR(REG_ESUBREG);
break;
}
if (p->pend[backrefnum] == 0) {
SETERROR(REG_ESUBREG);
break;
}
/* Make sure everything checks out and emit the sequence
* that marks a back-reference to the parse structure.
*/
assert(backrefnum <= p->g->nsub);
EMIT(OBACK_, backrefnum);
assert(p->pbegin[backrefnum] != 0);
assert(OP(p->strip[p->pbegin[backrefnum]]) == OLPAREN);
assert(OP(p->strip[p->pend[backrefnum]]) == ORPAREN);
(void)dupl(p, p->pbegin[backrefnum] + 1, p->pend[backrefnum]);
EMIT(O_BACK, backrefnum);
p->g->backrefs = 1;
break;
case '{': /* okay as ordinary except if digit follows */
REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);

View File

@ -0,0 +1,7 @@
; RUN: not FileCheck -check-prefix=CHECK-BACKREF %s < /dev/null 2>&1 | FileCheck -check-prefix=ERR-CHECK-BACKREF %s
; ERR-CHECK-BACKREF: error: Can't back-reference more than 20 variables
r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r13 r14 r15 r16 r17 r18 r19 r20 r21
; CHECK-BACKREF: [[REG1:r0]] [[REG2:r1]] [[REG3:r2]] [[REG4:r3]] [[REG5:r4]] [[REG6:r5]] [[REG7:r6]] [[REG8:r7]] [[REG9:r8]] [[REG10:r9]] [[REG11:r10]] [[REG12:r11]] [[REG13:r12]] [[REG14:r13]] [[REG15:r14]] [[REG16:r15]] [[REG17:r16]] [[REG18:r17]] [[REG19:r18]] [[REG20:r19]] [[REG21:r20]] [[REG21]]

View File

@ -1,8 +1,5 @@
; RUN: FileCheck -input-file %s %s
; XFAIL: *
; Trying to back-reference more than 9 variables is intended to fail.
r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r9 r8
r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r9
; CHECK: [[REG1:r0]] [[REG2:r1]] [[REG3:r2]] [[REG4:r3]] [[REG5:r4]] [[REG6:r5]] [[REG7:r6]] [[REG8:r7]] [[REG9:r8]] [[REG10:r9]] [[REG10]]
; CHECK: [[REG1:r0]] [[REG2:r1]] [[REG3:r2]] [[REG4:r3]] [[REG5:r4]] [[REG6:r5]] [[REG7:r6]] [[REG8:r7]] [[REG9:r8]] [[REG10:r9]] [[REG10]] [[REG9]]

View File

@ -106,6 +106,25 @@ TEST_F(RegexTest, Backreferences) {
EXPECT_EQ(2u, Matches.size());
EXPECT_FALSE(r6.match("abc_ab", &Matches));
EXPECT_FALSE(r6.match("abc_xyz", &Matches));
Matches.clear();
Regex r7("(a)|(b)|(c)|(d)|(e)|(f)|(g)|(h)|(i)|(j)_\\g{10}");
EXPECT_TRUE(r7.match("j_j", &Matches));
EXPECT_FALSE(r7.match("k_k", &Matches));
EXPECT_FALSE(r7.match("j_k", &Matches));
EXPECT_EQ(11u, Matches.size());
std::string Error;
Matches.clear();
Regex r8("(a|b|c|d|e|f|g|h|i|j|k|l|m|n)_\\g{21}");
EXPECT_FALSE(r8.match("j_j", &Matches, &Error));
EXPECT_EQ(Error, "invalid backreference number");
Matches.clear();
Regex r9("(a|b|c|d|e|f|g|h|i|j|k|l|m|n)_\\g{20}");
r9.match("n_n", &Matches);
EXPECT_EQ(0u, Matches.size());
}
TEST_F(RegexTest, Substitution) {