
This enables specifying the extract modifier to extract all matches into a function. This currently does this very directly by converting all operands to function arguments (ones due to results of other matched ops are dropped) and all results as return values. Differential Revision: https://reviews.llvm.org/D158693
595 lines
19 KiB
C++
595 lines
19 KiB
C++
//===- Parser.cpp - Matcher expression parser -----------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// Recursive parser implementation for the matcher expression grammar.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "Parser.h"
|
|
|
|
#include <vector>
|
|
|
|
namespace mlir::query::matcher::internal {
|
|
|
|
// Simple structure to hold information for one token from the parser.
|
|
struct Parser::TokenInfo {
|
|
TokenInfo() = default;
|
|
|
|
// Method to set the kind and text of the token
|
|
void set(TokenKind newKind, llvm::StringRef newText) {
|
|
kind = newKind;
|
|
text = newText;
|
|
}
|
|
|
|
// Known identifiers.
|
|
static const char *const ID_Extract;
|
|
|
|
llvm::StringRef text;
|
|
TokenKind kind = TokenKind::Eof;
|
|
SourceRange range;
|
|
VariantValue value;
|
|
};
|
|
|
|
const char *const Parser::TokenInfo::ID_Extract = "extract";
|
|
|
|
class Parser::CodeTokenizer {
|
|
public:
|
|
// Constructor with matcherCode and error
|
|
explicit CodeTokenizer(llvm::StringRef matcherCode, Diagnostics *error)
|
|
: code(matcherCode), startOfLine(matcherCode), error(error) {
|
|
nextToken = getNextToken();
|
|
}
|
|
|
|
// Constructor with matcherCode, error, and codeCompletionOffset
|
|
CodeTokenizer(llvm::StringRef matcherCode, Diagnostics *error,
|
|
unsigned codeCompletionOffset)
|
|
: code(matcherCode), startOfLine(matcherCode), error(error),
|
|
codeCompletionLocation(matcherCode.data() + codeCompletionOffset) {
|
|
nextToken = getNextToken();
|
|
}
|
|
|
|
// Peek at next token without consuming it
|
|
const TokenInfo &peekNextToken() const { return nextToken; }
|
|
|
|
// Consume and return the next token
|
|
TokenInfo consumeNextToken() {
|
|
TokenInfo thisToken = nextToken;
|
|
nextToken = getNextToken();
|
|
return thisToken;
|
|
}
|
|
|
|
// Skip any newline tokens
|
|
TokenInfo skipNewlines() {
|
|
while (nextToken.kind == TokenKind::NewLine)
|
|
nextToken = getNextToken();
|
|
return nextToken;
|
|
}
|
|
|
|
// Consume and return next token, ignoring newlines
|
|
TokenInfo consumeNextTokenIgnoreNewlines() {
|
|
skipNewlines();
|
|
return nextToken.kind == TokenKind::Eof ? nextToken : consumeNextToken();
|
|
}
|
|
|
|
// Return kind of next token
|
|
TokenKind nextTokenKind() const { return nextToken.kind; }
|
|
|
|
private:
|
|
// Helper function to get the first character as a new StringRef and drop it
|
|
// from the original string
|
|
llvm::StringRef firstCharacterAndDrop(llvm::StringRef &str) {
|
|
assert(!str.empty());
|
|
llvm::StringRef firstChar = str.substr(0, 1);
|
|
str = str.drop_front();
|
|
return firstChar;
|
|
}
|
|
|
|
// Get next token, consuming whitespaces and handling different token types
|
|
TokenInfo getNextToken() {
|
|
consumeWhitespace();
|
|
TokenInfo result;
|
|
result.range.start = currentLocation();
|
|
|
|
// Code completion case
|
|
if (codeCompletionLocation && codeCompletionLocation <= code.data()) {
|
|
result.set(TokenKind::CodeCompletion,
|
|
llvm::StringRef(codeCompletionLocation, 0));
|
|
codeCompletionLocation = nullptr;
|
|
return result;
|
|
}
|
|
|
|
// End of file case
|
|
if (code.empty()) {
|
|
result.set(TokenKind::Eof, "");
|
|
return result;
|
|
}
|
|
|
|
// Switch to handle specific characters
|
|
switch (code[0]) {
|
|
case '#':
|
|
code = code.drop_until([](char c) { return c == '\n'; });
|
|
return getNextToken();
|
|
case ',':
|
|
result.set(TokenKind::Comma, firstCharacterAndDrop(code));
|
|
break;
|
|
case '.':
|
|
result.set(TokenKind::Period, firstCharacterAndDrop(code));
|
|
break;
|
|
case '\n':
|
|
++line;
|
|
startOfLine = code.drop_front();
|
|
result.set(TokenKind::NewLine, firstCharacterAndDrop(code));
|
|
break;
|
|
case '(':
|
|
result.set(TokenKind::OpenParen, firstCharacterAndDrop(code));
|
|
break;
|
|
case ')':
|
|
result.set(TokenKind::CloseParen, firstCharacterAndDrop(code));
|
|
break;
|
|
case '"':
|
|
case '\'':
|
|
consumeStringLiteral(&result);
|
|
break;
|
|
default:
|
|
parseIdentifierOrInvalid(&result);
|
|
break;
|
|
}
|
|
|
|
result.range.end = currentLocation();
|
|
return result;
|
|
}
|
|
|
|
// Consume a string literal, handle escape sequences and missing closing
|
|
// quote.
|
|
void consumeStringLiteral(TokenInfo *result) {
|
|
bool inEscape = false;
|
|
const char marker = code[0];
|
|
for (size_t length = 1; length < code.size(); ++length) {
|
|
if (inEscape) {
|
|
inEscape = false;
|
|
continue;
|
|
}
|
|
if (code[length] == '\\') {
|
|
inEscape = true;
|
|
continue;
|
|
}
|
|
if (code[length] == marker) {
|
|
result->kind = TokenKind::Literal;
|
|
result->text = code.substr(0, length + 1);
|
|
result->value = code.substr(1, length - 1);
|
|
code = code.drop_front(length + 1);
|
|
return;
|
|
}
|
|
}
|
|
llvm::StringRef errorText = code;
|
|
code = code.drop_front(code.size());
|
|
SourceRange range;
|
|
range.start = result->range.start;
|
|
range.end = currentLocation();
|
|
error->addError(range, ErrorType::ParserStringError) << errorText;
|
|
result->kind = TokenKind::Error;
|
|
}
|
|
|
|
void parseIdentifierOrInvalid(TokenInfo *result) {
|
|
if (isalnum(code[0])) {
|
|
// Parse an identifier
|
|
size_t tokenLength = 1;
|
|
|
|
while (true) {
|
|
// A code completion location in/immediately after an identifier will
|
|
// cause the portion of the identifier before the code completion
|
|
// location to become a code completion token.
|
|
if (codeCompletionLocation == code.data() + tokenLength) {
|
|
codeCompletionLocation = nullptr;
|
|
result->kind = TokenKind::CodeCompletion;
|
|
result->text = code.substr(0, tokenLength);
|
|
code = code.drop_front(tokenLength);
|
|
return;
|
|
}
|
|
if (tokenLength == code.size() || !(isalnum(code[tokenLength])))
|
|
break;
|
|
++tokenLength;
|
|
}
|
|
result->kind = TokenKind::Ident;
|
|
result->text = code.substr(0, tokenLength);
|
|
code = code.drop_front(tokenLength);
|
|
} else {
|
|
result->kind = TokenKind::InvalidChar;
|
|
result->text = code.substr(0, 1);
|
|
code = code.drop_front(1);
|
|
}
|
|
}
|
|
|
|
// Consume all leading whitespace from code, except newlines
|
|
void consumeWhitespace() { code = code.ltrim(" \t\v\f\r"); }
|
|
|
|
// Returns the current location in the source code
|
|
SourceLocation currentLocation() {
|
|
SourceLocation location;
|
|
location.line = line;
|
|
location.column = code.data() - startOfLine.data() + 1;
|
|
return location;
|
|
}
|
|
|
|
llvm::StringRef code;
|
|
llvm::StringRef startOfLine;
|
|
unsigned line = 1;
|
|
Diagnostics *error;
|
|
TokenInfo nextToken;
|
|
const char *codeCompletionLocation = nullptr;
|
|
};
|
|
|
|
Parser::Sema::~Sema() = default;
|
|
|
|
std::vector<ArgKind> Parser::Sema::getAcceptedCompletionTypes(
|
|
llvm::ArrayRef<std::pair<MatcherCtor, unsigned>> context) {
|
|
return {};
|
|
}
|
|
|
|
std::vector<MatcherCompletion>
|
|
Parser::Sema::getMatcherCompletions(llvm::ArrayRef<ArgKind> acceptedTypes) {
|
|
return {};
|
|
}
|
|
|
|
// Entry for the scope of a parser
|
|
struct Parser::ScopedContextEntry {
|
|
Parser *parser;
|
|
|
|
ScopedContextEntry(Parser *parser, MatcherCtor c) : parser(parser) {
|
|
parser->contextStack.emplace_back(c, 0u);
|
|
}
|
|
|
|
~ScopedContextEntry() { parser->contextStack.pop_back(); }
|
|
|
|
void nextArg() { ++parser->contextStack.back().second; }
|
|
};
|
|
|
|
// Parse and validate expressions starting with an identifier.
|
|
// This function can parse named values and matchers. In case of failure, it
|
|
// will try to determine the user's intent to give an appropriate error message.
|
|
bool Parser::parseIdentifierPrefixImpl(VariantValue *value) {
|
|
const TokenInfo nameToken = tokenizer->consumeNextToken();
|
|
|
|
if (tokenizer->nextTokenKind() != TokenKind::OpenParen) {
|
|
// Parse as a named value.
|
|
auto namedValue =
|
|
namedValues ? namedValues->lookup(nameToken.text) : VariantValue();
|
|
|
|
if (!namedValue.isMatcher()) {
|
|
error->addError(tokenizer->peekNextToken().range,
|
|
ErrorType::ParserNotAMatcher);
|
|
return false;
|
|
}
|
|
|
|
if (tokenizer->nextTokenKind() == TokenKind::NewLine) {
|
|
error->addError(tokenizer->peekNextToken().range,
|
|
ErrorType::ParserNoOpenParen)
|
|
<< "NewLine";
|
|
return false;
|
|
}
|
|
|
|
// If the syntax is correct and the name is not a matcher either, report
|
|
// an unknown named value.
|
|
if ((tokenizer->nextTokenKind() == TokenKind::Comma ||
|
|
tokenizer->nextTokenKind() == TokenKind::CloseParen ||
|
|
tokenizer->nextTokenKind() == TokenKind::NewLine ||
|
|
tokenizer->nextTokenKind() == TokenKind::Eof) &&
|
|
!sema->lookupMatcherCtor(nameToken.text)) {
|
|
error->addError(nameToken.range, ErrorType::RegistryValueNotFound)
|
|
<< nameToken.text;
|
|
return false;
|
|
}
|
|
// Otherwise, fallback to the matcher parser.
|
|
}
|
|
|
|
tokenizer->skipNewlines();
|
|
|
|
assert(nameToken.kind == TokenKind::Ident);
|
|
TokenInfo openToken = tokenizer->consumeNextToken();
|
|
if (openToken.kind != TokenKind::OpenParen) {
|
|
error->addError(openToken.range, ErrorType::ParserNoOpenParen)
|
|
<< openToken.text;
|
|
return false;
|
|
}
|
|
|
|
std::optional<MatcherCtor> ctor = sema->lookupMatcherCtor(nameToken.text);
|
|
|
|
// Parse as a matcher expression.
|
|
return parseMatcherExpressionImpl(nameToken, openToken, ctor, value);
|
|
}
|
|
|
|
bool Parser::parseChainedExpression(std::string &argument) {
|
|
// Parse the parenthesized argument to .extract("foo")
|
|
// Note: EOF is handled inside the consume functions and would fail below when
|
|
// checking token kind.
|
|
const TokenInfo openToken = tokenizer->consumeNextToken();
|
|
const TokenInfo argumentToken = tokenizer->consumeNextTokenIgnoreNewlines();
|
|
const TokenInfo closeToken = tokenizer->consumeNextTokenIgnoreNewlines();
|
|
|
|
if (openToken.kind != TokenKind::OpenParen) {
|
|
error->addError(openToken.range, ErrorType::ParserChainedExprNoOpenParen);
|
|
return false;
|
|
}
|
|
|
|
if (argumentToken.kind != TokenKind::Literal ||
|
|
!argumentToken.value.isString()) {
|
|
error->addError(argumentToken.range,
|
|
ErrorType::ParserChainedExprInvalidArg);
|
|
return false;
|
|
}
|
|
|
|
if (closeToken.kind != TokenKind::CloseParen) {
|
|
error->addError(closeToken.range, ErrorType::ParserChainedExprNoCloseParen);
|
|
return false;
|
|
}
|
|
|
|
// If all checks passed, extract the argument and return true.
|
|
argument = argumentToken.value.getString();
|
|
return true;
|
|
}
|
|
|
|
// Parse the arguments of a matcher
|
|
bool Parser::parseMatcherArgs(std::vector<ParserValue> &args, MatcherCtor ctor,
|
|
const TokenInfo &nameToken, TokenInfo &endToken) {
|
|
ScopedContextEntry sce(this, ctor);
|
|
|
|
while (tokenizer->nextTokenKind() != TokenKind::Eof) {
|
|
if (tokenizer->nextTokenKind() == TokenKind::CloseParen) {
|
|
// end of args.
|
|
endToken = tokenizer->consumeNextToken();
|
|
break;
|
|
}
|
|
|
|
if (!args.empty()) {
|
|
// We must find a , token to continue.
|
|
TokenInfo commaToken = tokenizer->consumeNextToken();
|
|
if (commaToken.kind != TokenKind::Comma) {
|
|
error->addError(commaToken.range, ErrorType::ParserNoComma)
|
|
<< commaToken.text;
|
|
return false;
|
|
}
|
|
}
|
|
|
|
ParserValue argValue;
|
|
tokenizer->skipNewlines();
|
|
|
|
argValue.text = tokenizer->peekNextToken().text;
|
|
argValue.range = tokenizer->peekNextToken().range;
|
|
if (!parseExpressionImpl(&argValue.value)) {
|
|
return false;
|
|
}
|
|
|
|
tokenizer->skipNewlines();
|
|
args.push_back(argValue);
|
|
sce.nextArg();
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// Parse and validate a matcher expression.
|
|
bool Parser::parseMatcherExpressionImpl(const TokenInfo &nameToken,
|
|
const TokenInfo &openToken,
|
|
std::optional<MatcherCtor> ctor,
|
|
VariantValue *value) {
|
|
if (!ctor) {
|
|
error->addError(nameToken.range, ErrorType::RegistryMatcherNotFound)
|
|
<< nameToken.text;
|
|
// Do not return here. We need to continue to give completion suggestions.
|
|
}
|
|
|
|
std::vector<ParserValue> args;
|
|
TokenInfo endToken;
|
|
|
|
tokenizer->skipNewlines();
|
|
|
|
if (!parseMatcherArgs(args, ctor.value_or(nullptr), nameToken, endToken)) {
|
|
return false;
|
|
}
|
|
|
|
// Check for the missing closing parenthesis
|
|
if (endToken.kind != TokenKind::CloseParen) {
|
|
error->addError(openToken.range, ErrorType::ParserNoCloseParen)
|
|
<< nameToken.text;
|
|
return false;
|
|
}
|
|
|
|
std::string functionName;
|
|
if (tokenizer->peekNextToken().kind == TokenKind::Period) {
|
|
tokenizer->consumeNextToken();
|
|
TokenInfo chainCallToken = tokenizer->consumeNextToken();
|
|
if (chainCallToken.kind == TokenKind::CodeCompletion) {
|
|
addCompletion(chainCallToken, MatcherCompletion("extract(\"", "extract"));
|
|
return false;
|
|
}
|
|
|
|
if (chainCallToken.kind != TokenKind::Ident ||
|
|
chainCallToken.text != TokenInfo::ID_Extract) {
|
|
error->addError(chainCallToken.range,
|
|
ErrorType::ParserMalformedChainedExpr);
|
|
return false;
|
|
}
|
|
|
|
if (chainCallToken.text == TokenInfo::ID_Extract &&
|
|
!parseChainedExpression(functionName))
|
|
return false;
|
|
}
|
|
|
|
if (!ctor)
|
|
return false;
|
|
// Merge the start and end infos.
|
|
SourceRange matcherRange = nameToken.range;
|
|
matcherRange.end = endToken.range.end;
|
|
VariantMatcher result = sema->actOnMatcherExpression(
|
|
*ctor, matcherRange, functionName, args, error);
|
|
if (result.isNull())
|
|
return false;
|
|
*value = result;
|
|
return true;
|
|
}
|
|
|
|
// If the prefix of this completion matches the completion token, add it to
|
|
// completions minus the prefix.
|
|
void Parser::addCompletion(const TokenInfo &compToken,
|
|
const MatcherCompletion &completion) {
|
|
if (llvm::StringRef(completion.typedText).starts_with(compToken.text)) {
|
|
completions.emplace_back(completion.typedText.substr(compToken.text.size()),
|
|
completion.matcherDecl);
|
|
}
|
|
}
|
|
|
|
std::vector<MatcherCompletion>
|
|
Parser::getNamedValueCompletions(llvm::ArrayRef<ArgKind> acceptedTypes) {
|
|
if (!namedValues)
|
|
return {};
|
|
|
|
std::vector<MatcherCompletion> result;
|
|
for (const auto &entry : *namedValues) {
|
|
std::string decl =
|
|
(entry.getValue().getTypeAsString() + " " + entry.getKey()).str();
|
|
result.emplace_back(entry.getKey(), decl);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
void Parser::addExpressionCompletions() {
|
|
const TokenInfo compToken = tokenizer->consumeNextTokenIgnoreNewlines();
|
|
assert(compToken.kind == TokenKind::CodeCompletion);
|
|
|
|
// We cannot complete code if there is an invalid element on the context
|
|
// stack.
|
|
for (const auto &entry : contextStack) {
|
|
if (!entry.first)
|
|
return;
|
|
}
|
|
|
|
auto acceptedTypes = sema->getAcceptedCompletionTypes(contextStack);
|
|
for (const auto &completion : sema->getMatcherCompletions(acceptedTypes)) {
|
|
addCompletion(compToken, completion);
|
|
}
|
|
|
|
for (const auto &completion : getNamedValueCompletions(acceptedTypes)) {
|
|
addCompletion(compToken, completion);
|
|
}
|
|
}
|
|
|
|
// Parse an <Expresssion>
|
|
bool Parser::parseExpressionImpl(VariantValue *value) {
|
|
switch (tokenizer->nextTokenKind()) {
|
|
case TokenKind::Literal:
|
|
*value = tokenizer->consumeNextToken().value;
|
|
return true;
|
|
case TokenKind::Ident:
|
|
return parseIdentifierPrefixImpl(value);
|
|
case TokenKind::CodeCompletion:
|
|
addExpressionCompletions();
|
|
return false;
|
|
case TokenKind::Eof:
|
|
error->addError(tokenizer->consumeNextToken().range,
|
|
ErrorType::ParserNoCode);
|
|
return false;
|
|
|
|
case TokenKind::Error:
|
|
// This error was already reported by the tokenizer.
|
|
return false;
|
|
case TokenKind::NewLine:
|
|
case TokenKind::OpenParen:
|
|
case TokenKind::CloseParen:
|
|
case TokenKind::Comma:
|
|
case TokenKind::Period:
|
|
case TokenKind::InvalidChar:
|
|
const TokenInfo token = tokenizer->consumeNextToken();
|
|
error->addError(token.range, ErrorType::ParserInvalidToken)
|
|
<< (token.kind == TokenKind::NewLine ? "NewLine" : token.text);
|
|
return false;
|
|
}
|
|
|
|
llvm_unreachable("Unknown token kind.");
|
|
}
|
|
|
|
Parser::Parser(CodeTokenizer *tokenizer, const Registry &matcherRegistry,
|
|
const NamedValueMap *namedValues, Diagnostics *error)
|
|
: tokenizer(tokenizer),
|
|
sema(std::make_unique<RegistrySema>(matcherRegistry)),
|
|
namedValues(namedValues), error(error) {}
|
|
|
|
Parser::RegistrySema::~RegistrySema() = default;
|
|
|
|
std::optional<MatcherCtor>
|
|
Parser::RegistrySema::lookupMatcherCtor(llvm::StringRef matcherName) {
|
|
return RegistryManager::lookupMatcherCtor(matcherName, matcherRegistry);
|
|
}
|
|
|
|
VariantMatcher Parser::RegistrySema::actOnMatcherExpression(
|
|
MatcherCtor ctor, SourceRange nameRange, llvm::StringRef functionName,
|
|
llvm::ArrayRef<ParserValue> args, Diagnostics *error) {
|
|
return RegistryManager::constructMatcher(ctor, nameRange, functionName, args,
|
|
error);
|
|
}
|
|
|
|
std::vector<ArgKind> Parser::RegistrySema::getAcceptedCompletionTypes(
|
|
llvm::ArrayRef<std::pair<MatcherCtor, unsigned>> context) {
|
|
return RegistryManager::getAcceptedCompletionTypes(context);
|
|
}
|
|
|
|
std::vector<MatcherCompletion> Parser::RegistrySema::getMatcherCompletions(
|
|
llvm::ArrayRef<ArgKind> acceptedTypes) {
|
|
return RegistryManager::getMatcherCompletions(acceptedTypes, matcherRegistry);
|
|
}
|
|
|
|
bool Parser::parseExpression(llvm::StringRef &code,
|
|
const Registry &matcherRegistry,
|
|
const NamedValueMap *namedValues,
|
|
VariantValue *value, Diagnostics *error) {
|
|
CodeTokenizer tokenizer(code, error);
|
|
Parser parser(&tokenizer, matcherRegistry, namedValues, error);
|
|
if (!parser.parseExpressionImpl(value))
|
|
return false;
|
|
auto nextToken = tokenizer.peekNextToken();
|
|
if (nextToken.kind != TokenKind::Eof &&
|
|
nextToken.kind != TokenKind::NewLine) {
|
|
error->addError(tokenizer.peekNextToken().range,
|
|
ErrorType::ParserTrailingCode);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
std::vector<MatcherCompletion>
|
|
Parser::completeExpression(llvm::StringRef &code, unsigned completionOffset,
|
|
const Registry &matcherRegistry,
|
|
const NamedValueMap *namedValues) {
|
|
Diagnostics error;
|
|
CodeTokenizer tokenizer(code, &error, completionOffset);
|
|
Parser parser(&tokenizer, matcherRegistry, namedValues, &error);
|
|
VariantValue dummy;
|
|
parser.parseExpressionImpl(&dummy);
|
|
|
|
return parser.completions;
|
|
}
|
|
|
|
std::optional<DynMatcher> Parser::parseMatcherExpression(
|
|
llvm::StringRef &code, const Registry &matcherRegistry,
|
|
const NamedValueMap *namedValues, Diagnostics *error) {
|
|
VariantValue value;
|
|
if (!parseExpression(code, matcherRegistry, namedValues, &value, error))
|
|
return std::nullopt;
|
|
if (!value.isMatcher()) {
|
|
error->addError(SourceRange(), ErrorType::ParserNotAMatcher);
|
|
return std::nullopt;
|
|
}
|
|
std::optional<DynMatcher> result = value.getMatcher().getDynMatcher();
|
|
if (!result) {
|
|
error->addError(SourceRange(), ErrorType::ParserOverloadedType)
|
|
<< value.getTypeAsString();
|
|
}
|
|
return result;
|
|
}
|
|
|
|
} // namespace mlir::query::matcher::internal
|