[regex][FileCheck] Support back-references up to 20. (#174150)
Support `\g{n}`-style back references in `regcomp` as well by increasing
the limit from 9 to 20 and adding additional parsing. Update the limit
checks in FileCheck. The limit can theoretically be removed by
reallocating the regex matcher's internal arrays, but I don't see a use
case for that as of now.
Update a test that now should pass when using more than 9
back-references.
Add a new test that checks the error message explicitly.
This commit is contained in:
parent
497a6d6722
commit
636f34ea56
@ -28,6 +28,8 @@
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
constexpr static int BackrefLimit = 20;
|
||||
|
||||
StringRef ExpressionFormat::toString() const {
|
||||
switch (Value) {
|
||||
case Kind::NoFormat:
|
||||
@ -1054,10 +1056,11 @@ bool Pattern::parsePattern(StringRef PatternStr, StringRef Prefix,
|
||||
if (!IsNumBlock &&
|
||||
(It = VariableDefs.find(SubstStr)) != VariableDefs.end()) {
|
||||
unsigned CaptureParenGroup = It->second;
|
||||
if (CaptureParenGroup < 1 || CaptureParenGroup > 9) {
|
||||
if (CaptureParenGroup < 1 || CaptureParenGroup > BackrefLimit) {
|
||||
SM.PrintMessage(SMLoc::getFromPointer(SubstStr.data()),
|
||||
SourceMgr::DK_Error,
|
||||
"Can't back-reference more than 9 variables");
|
||||
"Can't back-reference more than " +
|
||||
Twine(BackrefLimit) + " variables");
|
||||
return true;
|
||||
}
|
||||
AddBackrefToRegEx(CaptureParenGroup);
|
||||
@ -1108,8 +1111,14 @@ bool Pattern::AddRegExToRegEx(StringRef RS, unsigned &CurParen, SourceMgr &SM) {
|
||||
}
|
||||
|
||||
void Pattern::AddBackrefToRegEx(unsigned BackrefNum) {
|
||||
assert(BackrefNum >= 1 && BackrefNum <= 9 && "Invalid backref number");
|
||||
std::string Backref = std::string("\\") + std::string(1, '0' + BackrefNum);
|
||||
assert(BackrefNum >= 1 && BackrefNum <= BackrefLimit &&
|
||||
"Invalid backref number");
|
||||
std::string Backref;
|
||||
if (BackrefNum >= 1 && BackrefNum <= 9)
|
||||
Backref = std::string("\\") + std::string(1, '0' + BackrefNum);
|
||||
else
|
||||
Backref = std::string("\\g{") + std::to_string(BackrefNum) + '}';
|
||||
|
||||
RegExStr += Backref;
|
||||
}
|
||||
|
||||
|
||||
@ -154,7 +154,7 @@ std::string Regex::sub(StringRef Repl, StringRef String,
|
||||
// Add the skipped substring.
|
||||
Res += Split.first;
|
||||
|
||||
// Check for terminimation and trailing backslash.
|
||||
// Check for termination and trailing backslash.
|
||||
if (Split.second.empty()) {
|
||||
if (Repl.size() != Split.first.size() &&
|
||||
Error && Error->empty())
|
||||
|
||||
@ -192,7 +192,7 @@ struct parse {
|
||||
sopno slen; /* malloced strip length (used) */
|
||||
int ncsalloc; /* number of csets allocated */
|
||||
struct re_guts *g;
|
||||
#define NPAREN 10 /* we need to remember () 1-9 for back refs */
|
||||
#define NPAREN 21 /* we need to remember () 1-20 for back refs */
|
||||
sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
|
||||
sopno pend[NPAREN]; /* -> ) ([0] unused) */
|
||||
};
|
||||
@ -506,27 +506,47 @@ static void p_ere_exp(struct parse *p) {
|
||||
* least 4 matching groups specified in the pattern previously).
|
||||
*/
|
||||
backrefnum = c - '0';
|
||||
if (p->pend[backrefnum] == 0) {
|
||||
SETERROR(REG_ESUBREG);
|
||||
break;
|
||||
}
|
||||
|
||||
/* Make sure everything checks out and emit the sequence
|
||||
* that marks a back-reference to the parse structure.
|
||||
} else if (c == 'g') {
|
||||
/* Support back-references with an index greater than 9.
|
||||
* These look like this: \g{n}
|
||||
* Extract the number inside the brackets.
|
||||
*/
|
||||
assert(backrefnum <= p->g->nsub);
|
||||
EMIT(OBACK_, backrefnum);
|
||||
assert(p->pbegin[backrefnum] != 0);
|
||||
assert(OP(p->strip[p->pbegin[backrefnum]]) == OLPAREN);
|
||||
assert(OP(p->strip[p->pend[backrefnum]]) == ORPAREN);
|
||||
(void)dupl(p, p->pbegin[backrefnum] + 1, p->pend[backrefnum]);
|
||||
EMIT(O_BACK, backrefnum);
|
||||
p->g->backrefs = 1;
|
||||
MUSTEAT('{', REG_BADRPT);
|
||||
|
||||
backrefnum = 0;
|
||||
while (MORE() && isdigit(PEEK())) {
|
||||
c = GETNEXT();
|
||||
backrefnum = backrefnum * 10 + c - '0';
|
||||
}
|
||||
MUSTEAT('}', REG_BADRPT);
|
||||
} else {
|
||||
/* Other chars are simply themselves when escaped with a backslash.
|
||||
*/
|
||||
ordinary(p, c);
|
||||
break;
|
||||
}
|
||||
|
||||
if (backrefnum >= NPAREN) {
|
||||
SETERROR(REG_ESUBREG);
|
||||
break;
|
||||
}
|
||||
|
||||
if (p->pend[backrefnum] == 0) {
|
||||
SETERROR(REG_ESUBREG);
|
||||
break;
|
||||
}
|
||||
|
||||
/* Make sure everything checks out and emit the sequence
|
||||
* that marks a back-reference to the parse structure.
|
||||
*/
|
||||
assert(backrefnum <= p->g->nsub);
|
||||
EMIT(OBACK_, backrefnum);
|
||||
assert(p->pbegin[backrefnum] != 0);
|
||||
assert(OP(p->strip[p->pbegin[backrefnum]]) == OLPAREN);
|
||||
assert(OP(p->strip[p->pend[backrefnum]]) == ORPAREN);
|
||||
(void)dupl(p, p->pbegin[backrefnum] + 1, p->pend[backrefnum]);
|
||||
EMIT(O_BACK, backrefnum);
|
||||
p->g->backrefs = 1;
|
||||
break;
|
||||
case '{': /* okay as ordinary except if digit follows */
|
||||
REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
|
||||
|
||||
7
llvm/test/FileCheck/backref-limit.txt
Normal file
7
llvm/test/FileCheck/backref-limit.txt
Normal file
@ -0,0 +1,7 @@
|
||||
; RUN: not FileCheck -check-prefix=CHECK-BACKREF %s < /dev/null 2>&1 | FileCheck -check-prefix=ERR-CHECK-BACKREF %s
|
||||
|
||||
; ERR-CHECK-BACKREF: error: Can't back-reference more than 20 variables
|
||||
|
||||
r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r13 r14 r15 r16 r17 r18 r19 r20 r21
|
||||
|
||||
; CHECK-BACKREF: [[REG1:r0]] [[REG2:r1]] [[REG3:r2]] [[REG4:r3]] [[REG5:r4]] [[REG6:r5]] [[REG7:r6]] [[REG8:r7]] [[REG9:r8]] [[REG10:r9]] [[REG11:r10]] [[REG12:r11]] [[REG13:r12]] [[REG14:r13]] [[REG15:r14]] [[REG16:r15]] [[REG17:r16]] [[REG18:r17]] [[REG19:r18]] [[REG20:r19]] [[REG21:r20]] [[REG21]]
|
||||
@ -1,8 +1,5 @@
|
||||
; RUN: FileCheck -input-file %s %s
|
||||
; XFAIL: *
|
||||
|
||||
; Trying to back-reference more than 9 variables is intended to fail.
|
||||
r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r9 r8
|
||||
|
||||
r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r9
|
||||
|
||||
; CHECK: [[REG1:r0]] [[REG2:r1]] [[REG3:r2]] [[REG4:r3]] [[REG5:r4]] [[REG6:r5]] [[REG7:r6]] [[REG8:r7]] [[REG9:r8]] [[REG10:r9]] [[REG10]]
|
||||
; CHECK: [[REG1:r0]] [[REG2:r1]] [[REG3:r2]] [[REG4:r3]] [[REG5:r4]] [[REG6:r5]] [[REG7:r6]] [[REG8:r7]] [[REG9:r8]] [[REG10:r9]] [[REG10]] [[REG9]]
|
||||
|
||||
@ -106,6 +106,25 @@ TEST_F(RegexTest, Backreferences) {
|
||||
EXPECT_EQ(2u, Matches.size());
|
||||
EXPECT_FALSE(r6.match("abc_ab", &Matches));
|
||||
EXPECT_FALSE(r6.match("abc_xyz", &Matches));
|
||||
|
||||
Matches.clear();
|
||||
Regex r7("(a)|(b)|(c)|(d)|(e)|(f)|(g)|(h)|(i)|(j)_\\g{10}");
|
||||
EXPECT_TRUE(r7.match("j_j", &Matches));
|
||||
EXPECT_FALSE(r7.match("k_k", &Matches));
|
||||
EXPECT_FALSE(r7.match("j_k", &Matches));
|
||||
EXPECT_EQ(11u, Matches.size());
|
||||
|
||||
std::string Error;
|
||||
|
||||
Matches.clear();
|
||||
Regex r8("(a|b|c|d|e|f|g|h|i|j|k|l|m|n)_\\g{21}");
|
||||
EXPECT_FALSE(r8.match("j_j", &Matches, &Error));
|
||||
EXPECT_EQ(Error, "invalid backreference number");
|
||||
|
||||
Matches.clear();
|
||||
Regex r9("(a|b|c|d|e|f|g|h|i|j|k|l|m|n)_\\g{20}");
|
||||
r9.match("n_n", &Matches);
|
||||
EXPECT_EQ(0u, Matches.size());
|
||||
}
|
||||
|
||||
TEST_F(RegexTest, Substitution) {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user