[clang] Make -dump-tokens option align tokens (#164894)

When using `-Xclang -dump-tokens`, the lexer dump output is currently difficult to read because the columns are misaligned. The existing implementation simply separates the token name, spelling, flags, and location with `'\t'`, which results in inconsistent spacing. For example, this is the current output for the example provided in this patch **(BEFORE THIS PR)**: <img width="2936" height="632" alt="image" src="https://github.com/user-attachments/assets/ad893958-6d57-4a76-8838-7fc56e37e6a7" /> # Changes This small PR improves the readability of the token dump by: + Adding padding after the token name and after the spelling (the padding amounts were chosen empirically to produce good alignment on average). + Swapping the order of the location and the flags (since flags can take up a lot of space and disrupt alignment). The result is a more readable output **(AFTER THIS PR)**: <img width="1470" height="315" alt="image" src="https://github.com/user-attachments/assets/c24f24e5-a431-42cc-b5b6-232bac5c635e" />
2026-04-03 15:33:36 +03:00 · 2026-04-03 15:33:36 +03:00 · b9924c76da
commit b9924c76da
parent a44c15874d
2 changed files with 88 additions and 10 deletions
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@ -61,6 +61,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Capacity.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/MemoryBufferRef.h"
 #include "llvm/Support/SaveAndRestore.h"
@ -240,14 +241,59 @@ void Preprocessor::FinalizeForModelFile() {
 }

 void Preprocessor::DumpToken(const Token &Tok, bool DumpFlags) const {
-  llvm::errs() << tok::getTokenName(Tok.getKind());
+  std::string TokenStr;
+  llvm::raw_string_ostream OS(TokenStr);

-  if (!Tok.isAnnotation())
-    llvm::errs() << " '" << getSpelling(Tok) << "'";
+  // The width of 16 is chosen to comfortably fit most token kind names.
+  OS << llvm::formatv("{0,-16} ", tok::getTokenName(Tok.getKind()));
+
+  // Annotation tokens are just markers that don't have a spelling -- they
+  // indicate where something expanded.
+  if (!Tok.isAnnotation()) {
+    OS << "'";
+    // Escape string to prevent token spelling from spanning multiple lines.
+    OS.write_escaped(getSpelling(Tok));
+    OS << "'";
+  }
+
+  // The width of 48 (the 16 for the token kind name + roughly 32 characters
+  // for the quoted spelling) fits most variable names, keywords and annotations.
+  llvm::errs() << llvm::formatv("{0,-48} ", OS.str());

  if (!DumpFlags) return;

-  llvm::errs() << "\t";
+  auto Loc = Tok.getLocation();
+  llvm::errs() << "Loc=<";
+  DumpLocation(Loc);
+  llvm::errs() << ">";
+
+  // If the token points directly to a file location (i.e. not a macro
+  // expansion), then add additional padding so that trailing markers
+  // align, provided the line/column numbers are reasonably sized.
+  //
+  // Otherwise, if it's a macro expansion, don't bother with alignment,
+  // as the line will include multiple locations and be very long.
+  //
+  // NOTE: To keep this stateless, it doesn't account for filename length,
+  // so the markers will be temporarily misaligned whenever a new header starts.
+  if (Loc.isFileID()) {
+    PresumedLoc PLoc = SourceMgr.getPresumedLoc(Loc);
+
+    if (!PLoc.isInvalid()) {
+      int LineWidth = llvm::utostr(PLoc.getLine()).size();
+      int ColumnWidth = llvm::utostr(PLoc.getColumn()).size();
+
+      // Reserve space for lines up to 9999 and columns up to 99,
+      // which is 4 + 2 = 6 characters in total.
+      const int ReservedSpace = 6;
+
+      int LeftSpace = ReservedSpace - LineWidth - ColumnWidth;
+      int Padding = std::max<int>(0, LeftSpace);
+
+      llvm::errs().indent(Padding);
+    }
+  }
+
  if (Tok.isAtStartOfLine())
    llvm::errs() << " [StartOfLine]";
  if (Tok.hasLeadingSpace())
@ -256,13 +302,8 @@ void Preprocessor::DumpToken(const Token &Tok, bool DumpFlags) const {
    llvm::errs() << " [ExpandDisabled]";
  if (Tok.needsCleaning()) {
    const char *Start = SourceMgr.getCharacterData(Tok.getLocation());
-    llvm::errs() << " [UnClean='" << StringRef(Start, Tok.getLength())
-                 << "']";
+    llvm::errs() << " [UnClean='" << StringRef(Start, Tok.getLength()) << "']";
  }
-
-  llvm::errs() << "\tLoc=<";
-  DumpLocation(Tok.getLocation());
-  llvm::errs() << ">";
 }

 void Preprocessor::DumpLocation(SourceLocation Loc) const {
--- a/clang/test/Preprocessor/dump-tokens.cpp
+++ b/clang/test/Preprocessor/dump-tokens.cpp
@ -0,0 +1,37 @@
+// RUN: %clang_cc1 -dump-tokens %s 2>&1 | FileCheck %s --strict-whitespace
+
+// To make location reporting in the test more robust, provide line number and file name explicitly.
+#line 2 "dump-tokens.cpp"
+
+// Different kinds of identifiers with different spelling lengths.
+->                                  // CHECK:      arrow            '->'                            Loc=<{{.*}}:4:1>     [StartOfLine]
+5                                   // CHECK-NEXT: numeric_constant '5'                             Loc=<{{.*}}:5:1>     [StartOfLine]
+id                                  // CHECK-NEXT: identifier       'id'                            Loc=<{{.*}}:6:1>     [StartOfLine]
+&                                   // CHECK-NEXT: amp              '&'                             Loc=<{{.*}}:7:1>     [StartOfLine]
+)                                   // CHECK-NEXT: r_paren          ')'                             Loc=<{{.*}}:8:1>     [StartOfLine]
+unsigned                            // CHECK-NEXT: unsigned         'unsigned'                      Loc=<{{.*}}:9:1>     [StartOfLine]
+~                                   // CHECK-NEXT: tilde            '~'                             Loc=<{{.*}}:10:1>    [StartOfLine]
+long_variable_name_very_long        // CHECK-NEXT: identifier       'long_variable_name_very_long'  Loc=<{{.*}}:11:1>    [StartOfLine]
+union                               // CHECK-NEXT: union            'union'                         Loc=<{{.*}}:12:1>    [StartOfLine]
+42                                  // CHECK-NEXT: numeric_constant '42'                            Loc=<{{.*}}:13:1>    [StartOfLine]
+j                                   // CHECK-NEXT: identifier       'j'                             Loc=<{{.*}}:14:1>    [StartOfLine]
+&=                                  // CHECK-NEXT: ampequal         '&='                            Loc=<{{.*}}:15:1>    [StartOfLine]
+15                                  // CHECK-NEXT: numeric_constant '15'                            Loc=<{{.*}}:16:1>    [StartOfLine]
+
+// Different locations in line and trailing markers.
+ at different locations= in line    // CHECK-NEXT: identifier       'at'                            Loc=<{{.*}}:19:2>    [StartOfLine] [LeadingSpace]
+                                    // CHECK-NEXT: identifier       'different'                     Loc=<{{.*}}:19:5>    [LeadingSpace]
+                                    // CHECK-NEXT: identifier       'locations'                     Loc=<{{.*}}:19:15>   [LeadingSpace]
+                                    // CHECK-NEXT: equal            '='                             Loc=<{{.*}}:19:24>
+                                    // CHECK-NEXT: identifier       'in'                            Loc=<{{.*}}:19:26>   [LeadingSpace]
+                                    // CHECK-NEXT: identifier       'line'                          Loc=<{{.*}}:19:29>   [LeadingSpace]
+
+// Tokens that require escaping & annotations.
+#pragma clang __debug parser_crash  // CHECK-NEXT: annot_pragma_parser_crash                        Loc=<{{.*}}:27:23>
+                                    // CHECK-NEXT: eod              '\n'                            Loc=<{{.*}}:27:119>  [LeadingSpace]
+#pragma clang __debug captured      // CHECK-NEXT: annot_pragma_captured                            Loc=<{{.*}}:29:120>
+#pragma clang __debug dump X        // CHECK-NEXT: annot_pragma_dump                                Loc=<{{.*}}:30:23>
+                                    // CHECK-NEXT: identifier       'X'                             Loc=<{{.*}}:30:28>   [LeadingSpace]
+                                    // CHECK-NEXT: eod              '\n'                            Loc=<{{.*}}:30:119>  [LeadingSpace]
+                                    // CHECK-NEXT: eof              ''                              Loc=<{{.*}}:34:1>
+