[mlir] Make parser not rely on terminating null. (#151007)

This is used in a follow-up change to parse slices of a buffer. (cherry picked from commit 217f9e57d1cc46de51d3b36177c4ba4049aaa805)
2025-07-29 04:59:46 +02:00 · 2025-07-29 04:59:46 +02:00 · 0eba8cf2fb
commit 0eba8cf2fb
parent 5d71f7c2ac
3 changed files with 34 additions and 3 deletions
--- a/mlir/lib/AsmParser/DialectSymbolParser.cpp
+++ b/mlir/lib/AsmParser/DialectSymbolParser.cpp
@ -89,6 +89,7 @@ ParseResult Parser::parseDialectSymbolBody(StringRef &body,
    nestedPunctuation.pop_back();
    return success();
  };
+  const char *curBufferEnd = state.lex.getBufferEnd();
  do {
    // Handle code completions, which may appear in the middle of the symbol
    // body.
@ -98,6 +99,12 @@ ParseResult Parser::parseDialectSymbolBody(StringRef &body,
      break;
    }

+    if (curBufferEnd == curPtr) {
+      if (!nestedPunctuation.empty())
+        return emitPunctError();
+      return emitError("unexpected nul or EOF in pretty dialect name");
+    }
+
    char c = *curPtr++;
    switch (c) {
    case '\0':
--- a/mlir/lib/AsmParser/Lexer.cpp
+++ b/mlir/lib/AsmParser/Lexer.cpp
@ -37,6 +37,18 @@ Lexer::Lexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context,
             AsmParserCodeCompleteContext *codeCompleteContext)
    : sourceMgr(sourceMgr), context(context), codeCompleteLoc(nullptr) {
  auto bufferID = sourceMgr.getMainFileID();
+
+  // Check to see if the main buffer contains the last buffer, and if so the
+  // last buffer should be used as main file for parsing.
+  if (sourceMgr.getNumBuffers() > 1) {
+    unsigned lastFileID = sourceMgr.getNumBuffers();
+    const llvm::MemoryBuffer *main = sourceMgr.getMemoryBuffer(bufferID);
+    const llvm::MemoryBuffer *last = sourceMgr.getMemoryBuffer(lastFileID);
+    if (main->getBufferStart() <= last->getBufferStart() &&
+        main->getBufferEnd() >= last->getBufferEnd()) {
+      bufferID = lastFileID;
+    }
+  }
  curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer();
  curPtr = curBuffer.begin();

@ -71,6 +83,7 @@ Token Lexer::emitError(const char *loc, const Twine &message) {
 }

 Token Lexer::lexToken() {
+  const char *curBufferEnd = curBuffer.end();
  while (true) {
    const char *tokStart = curPtr;

@ -78,6 +91,9 @@ Token Lexer::lexToken() {
    if (tokStart == codeCompleteLoc)
      return formToken(Token::code_complete, tokStart);

+    if (tokStart == curBufferEnd)
+      return formToken(Token::eof, tokStart);
+
    // Lex the next token.
    switch (*curPtr++) {
    default:
@ -102,7 +118,7 @@ Token Lexer::lexToken() {
    case 0:
      // This may either be a nul character in the source file or may be the EOF
      // marker that llvm::MemoryBuffer guarantees will be there.
-      if (curPtr - 1 == curBuffer.end())
+      if (curPtr - 1 == curBufferEnd)
        return formToken(Token::eof, tokStart);
      continue;

@ -259,7 +275,11 @@ void Lexer::skipComment() {
  assert(*curPtr == '/');
  ++curPtr;

+  const char *curBufferEnd = curBuffer.end();
  while (true) {
+    if (curPtr == curBufferEnd)
+      return;
+
    switch (*curPtr++) {
    case '\n':
    case '\r':
@ -267,7 +287,7 @@ void Lexer::skipComment() {
      return;
    case 0:
      // If this is the end of the buffer, end the comment.
-      if (curPtr - 1 == curBuffer.end()) {
+      if (curPtr - 1 == curBufferEnd) {
        --curPtr;
        return;
      }
@ -405,6 +425,7 @@ Token Lexer::lexPrefixedIdentifier(const char *tokStart) {
 Token Lexer::lexString(const char *tokStart) {
  assert(curPtr[-1] == '"');

+  const char *curBufferEnd = curBuffer.end();
  while (true) {
    // Check to see if there is a code completion location within the string. In
    // these cases we generate a completion location and place the currently
@ -419,7 +440,7 @@ Token Lexer::lexString(const char *tokStart) {
    case 0:
      // If this is a random nul character in the middle of a string, just
      // include it.  If it is the end of file, then it is an error.
-      if (curPtr - 1 != curBuffer.end())
+      if (curPtr - 1 != curBufferEnd)
        continue;
      [[fallthrough]];
    case '\n':
--- a/mlir/lib/AsmParser/Lexer.h
+++ b/mlir/lib/AsmParser/Lexer.h
@ -40,6 +40,9 @@ public:
  /// Returns the start of the buffer.
  const char *getBufferBegin() { return curBuffer.data(); }

+  /// Returns a pointer to one past the last character of the buffer.
+  const char *getBufferEnd() { return curBuffer.end(); }
+
  /// Return the code completion location of the lexer, or nullptr if there is
  /// none.
  const char *getCodeCompleteLoc() const { return codeCompleteLoc; }