To better ensure that bytecode `@update` implementations return a 0/1 value (see https://github.com/llvm/llvm-project/pull/181199), this change makes the Python -> formatter bytecode compiler require that Python `update` methods be declared to return `bool`. A declaration without that annotation, such as `def update(self):`, is now a compiler error.
1334 lines
43 KiB
Python
1334 lines
43 KiB
Python
"""
|
|
Specification, assembler, disassembler, and interpreter
|
|
for LLDB dataformatter bytecode.
|
|
|
|
See https://lldb.llvm.org/resources/formatterbytecode.html for more details.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
# Work around the fact that one of the local files is called
|
|
# types.py, which breaks some versions of python.
|
|
import os, sys
|
|
|
|
path = os.path.abspath(os.path.dirname(__file__))
|
|
if path in sys.path:
|
|
sys.path.remove(path)
|
|
|
|
import re
|
|
import io
|
|
import ast
|
|
import enum
|
|
import shlex
|
|
import textwrap
|
|
from copy import copy
|
|
from dataclasses import dataclass
|
|
from typing import Any, BinaryIO, Optional, Sequence, TextIO, Tuple, Union, cast
|
|
|
|
# Version byte of the serialized record format. It is written as the first
# byte by BytecodeSection.write_binary and checked by disassemble_file.
BINARY_VERSION = 1

# Types
# Numeric tags for the value types of the bytecode.
# NOTE(review): not referenced in the visible part of this file — presumably
# kept in sync with the bytecode specification; confirm.
type_String = 1
type_Int = 2
type_UInt = 3
type_Object = 4
type_Type = 5
|
|
|
|
# Opcodes
|
|
# Two-way opcode table: mnemonic -> opcode number and opcode number -> mnemonic.
opcode = {}


def define_opcode(n, mnemonic, name):
    """Register opcode number `n`.

    A module-level constant `op_<name>` is always created. Opcodes that have
    an assembly mnemonic are additionally entered into `opcode` in both
    directions; opcodes without one (`mnemonic` is falsy) are not.
    """
    globals()[f"op_{name}"] = n
    if not mnemonic:
        return
    opcode.update({mnemonic: n, n: mnemonic})
|
|
|
|
|
|
# Data stack manipulation.
define_opcode(1, "dup", "dup")
define_opcode(2, "drop", "drop")
define_opcode(3, "pick", "pick")
define_opcode(4, "over", "over")
define_opcode(5, "swap", "swap")
define_opcode(6, "rot", "rot")

# Control stack manipulation. Note that the mnemonic of `begin` is "{".
define_opcode(0x10, "{", "begin")
define_opcode(0x11, "if", "if")
define_opcode(0x12, "ifelse", "ifelse")
define_opcode(0x13, "return", "return")

# Literals. These have no mnemonic: the assembler recognizes them from the
# shape of the token (digits, leading '@', leading '"').
define_opcode(0x20, None, "lit_uint")
define_opcode(0x21, None, "lit_int")
define_opcode(0x22, None, "lit_string")
define_opcode(0x23, None, "lit_selector")

# Conversions and null test.
define_opcode(0x2A, "as_int", "as_int")
define_opcode(0x2B, "as_uint", "as_uint")
define_opcode(0x2C, "is_null", "is_null")

# Arithmetic.
define_opcode(0x30, "+", "plus")
define_opcode(0x31, "-", "minus")
define_opcode(0x32, "*", "mul")
define_opcode(0x33, "/", "div")
define_opcode(0x34, "%", "mod")
define_opcode(0x35, "<<", "shl")
define_opcode(0x36, ">>", "shr")

# Logic.
define_opcode(0x40, "&", "and")
define_opcode(0x41, "|", "or")
define_opcode(0x42, "^", "xor")
define_opcode(0x43, "~", "not")

# Comparisons. Note the asymmetric spelling: "=<" for le but ">=" for ge.
define_opcode(0x50, "=", "eq")
define_opcode(0x51, "!=", "neq")
define_opcode(0x52, "<", "lt")
define_opcode(0x53, ">", "gt")
define_opcode(0x54, "=<", "le")
define_opcode(0x55, ">=", "ge")

# Function calls.
define_opcode(0x60, "call", "call")
|
|
|
|
# Function signatures
|
|
# Numeric ids of the formatter entry points, in serialization order.
(
    sig_summary,
    sig_init,
    sig_get_num_children,
    sig_get_child_index,
    sig_get_child_at_index,
    sig_get_value,
    sig_update,
) = range(7)

# Signature name -> numeric id.
SIGNATURES = {
    "summary": sig_summary,
    "init": sig_init,
    "get_num_children": sig_get_num_children,
    "get_child_index": sig_get_child_index,
    "get_child_at_index": sig_get_child_at_index,
    "get_value": sig_get_value,
    "update": sig_update,
}

# Regex alternation of all signature names, and the reverse (id -> name) map.
SIGNATURE_NAMES = "|".join(SIGNATURES)
SIGNATURE_IDS = {sig_id: sig_name for sig_name, sig_id in SIGNATURES.items()}
|
|
|
|
# Selectors
|
|
# Two-way selector table: "@name" -> selector number and number -> "@name".
selector = {}


def define_selector(n, name):
    """Register selector number `n` under the assembly label `@<name>`.

    Creates the module-level constant `sel_<name>` and enters the selector
    into `selector` in both directions.
    """
    label = f"@{name}"
    globals()[f"sel_{name}"] = n
    selector[label] = n
    selector[n] = label
|
|
|
|
|
|
# Summary selectors.
define_selector(0, "summary")
define_selector(1, "type_summary")

# Child, type and value accessors. Number 0x14 is not defined here.
define_selector(0x10, "get_num_children")
define_selector(0x11, "get_child_at_index")
define_selector(0x12, "get_child_with_name")
define_selector(0x13, "get_child_index")
define_selector(0x15, "get_type")
define_selector(0x16, "get_template_argument_type")
define_selector(0x17, "cast")
define_selector(0x18, "get_synthetic_value")
define_selector(0x19, "get_non_synthetic_value")
define_selector(0x20, "get_value")
define_selector(0x21, "get_value_as_unsigned")
define_selector(0x22, "get_value_as_signed")
define_selector(0x23, "get_value_as_address")

# Memory reads.
# NOTE(review): the Python interpreter in this file does not implement these.
define_selector(0x40, "read_memory_byte")
define_selector(0x41, "read_memory_uint32")
define_selector(0x42, "read_memory_int32")
define_selector(0x43, "read_memory_unsigned")
define_selector(0x44, "read_memory_signed")
define_selector(0x45, "read_memory_address")
define_selector(0x46, "read_memory")

# String helpers.
define_selector(0x50, "fmt")
define_selector(0x51, "sprintf")
define_selector(0x52, "strlen")
|
|
|
|
|
|
################################################################################
|
|
# Assembler.
|
|
################################################################################
|
|
|
|
# Matches a token that is exactly a signature label, i.e. "@" + one of the
# known signature names + ":" (for example "@summary:").
_SIGNATURE_LABEL = re.compile(f"@(?:{SIGNATURE_NAMES}):$")
|
|
|
|
|
|
def _tokenize(assembler: str) -> list[str]:
|
|
"""Convert string of assembly into tokens."""
|
|
# With one exception, tokens are sequences of non-space characters.
|
|
# The one exception is string literals, which may have spaces.
|
|
|
|
# To parse strings, which can contain escaped contents, use a "Friedl
|
|
# unrolled loop". The high level of such a regex is:
|
|
# open normal* ( special normal* )* close
|
|
# which for string literals is:
|
|
string_literal = r'" [^"\\]* (?: \\. [^"\\]* )* "'
|
|
|
|
return re.findall(rf"{string_literal} | \S+", assembler, re.VERBOSE)
|
|
|
|
|
|
def _segment_by_signature(input: list[str]) -> list[Tuple[str, list[str]]]:
    """Group tokens by the signature label that precedes them.

    Returns one `(signature name, tokens)` pair per label, in input order.
    Tokens that appear before the first label belong to no signature and are
    dropped.
    """
    segments = []

    # The segment currently being filled, as (name, tokens); None until the
    # first label has been seen.
    open_segment = None

    for token in input:
        if _SIGNATURE_LABEL.match(token):
            if open_segment is not None:
                segments.append(open_segment)
            # Strip the leading "@" and the trailing ":".
            open_segment = (token[1:-1], [])
        elif open_segment is not None:
            open_segment[1].append(token)

    if open_segment is not None:
        segments.append(open_segment)

    return segments
|
|
|
|
|
|
@dataclass
class BytecodeSection:
    """Abstraction of the data serialized to __lldbformatters sections.

    A record consists of a version byte, the ULEB128 size of the remainder,
    the type name (ULEB128 byte length followed by UTF-8 bytes), a flags byte
    and then, per signature, its id byte, the ULEB128 program size and the
    program bytes.
    """

    # Name of the type the formatter applies to.
    type_name: str
    # Flags byte, written verbatim.
    flags: int
    # (signature name, assembled bytecode) pairs.
    signatures: list[Tuple[str, bytes]]

    def validate(self):
        """Raise ValueError if a signature name occurs more than once."""
        seen = set()
        for sig, _ in self.signatures:
            if sig in seen:
                raise ValueError(f"duplicate signature: {sig}")
            seen.add(sig)

    def _encoded_type_name(self) -> bytes:
        """Return the type name as the UTF-8 bytes that get serialized."""
        return self.type_name.encode("utf-8")

    def _to_binary(self) -> bytes:
        """Serialize everything that follows the version and record size."""
        name = self._encoded_type_name()
        record = bytearray()
        # The size prefix counts encoded bytes, not characters, so that
        # non-ASCII type names round-trip.
        record.extend(_to_uleb(len(name)))
        record.extend(name)
        record.extend(_to_byte(self.flags))
        for sig, bc in self.signatures:
            record.extend(_to_byte(SIGNATURES[sig]))
            record.extend(_to_uleb(len(bc)))
            record.extend(bc)

        return bytes(record)

    def write_binary(self, output: BinaryIO) -> None:
        """Validate and write the complete binary record to `output`."""
        self.validate()

        record = self._to_binary()
        output.write(_to_byte(BINARY_VERSION))
        output.write(_to_uleb(len(record)))
        output.write(record)

    def write_source(self, output: TextIO, language: str) -> None:
        """Write the record as source code in `language` ("c" or "swift").

        Any other value of `language` writes nothing.
        """
        if language == "c":
            self.write_c(output)
        elif language == "swift":
            self.write_swift(output)

    class _CBuilder:
        """Helper class for emitting binary data as a C-string literal."""

        # (quoted C string literal, comment) pairs.
        entries: list[Tuple[str, str]]

        def __init__(self) -> None:
            self.entries = []

        def emit_byte(self, x: int, comment: str) -> None:
            self.emit_bytes(_to_byte(x), comment)

        def emit_uleb(self, x: int, comment: str) -> None:
            self.emit_bytes(_to_uleb(x), comment)

        def emit_bytes(self, x: bytes, comment: str) -> None:
            # Construct zero-padded hex escapes of length two.
            string = "".join(f"\\x{b:02x}" for b in x)
            self.emit_string(string, comment)

        def emit_string(self, string: str, comment: str) -> None:
            # NOTE(review): `string` is placed in the literal verbatim; a type
            # name containing '"' or '\' would need escaping — confirm whether
            # such names can occur.
            self.entries.append((f'"{string}"', comment))

    class _SwiftBuilder:
        """Helper class for emitting binary data as a Swift tuple literal."""

        # (raw bytes, comment) pairs.
        entries: list[Tuple[bytes, str]]

        def __init__(self) -> None:
            self.entries = []

        def emit_byte(self, x: int, comment: str) -> None:
            self.emit_bytes(_to_byte(x), comment)

        def emit_uleb(self, x: int, comment: str) -> None:
            self.emit_bytes(_to_uleb(x), comment)

        def emit_bytes(self, x: bytes, comment: str) -> None:
            self.entries.append((x, comment))

        def emit_string(self, string: str, comment: str) -> None:
            self.emit_bytes(string.encode(), comment)

        @property
        def type_decl(self):
            """Swift tuple type with one UInt8 element per emitted byte."""
            total_bytes = sum((len(bs) for bs, _ in self.entries))
            element_list = ", ".join(["UInt8"] * total_bytes)
            return f"({element_list})"

    def _build(self, builder) -> None:
        """Feed the record, field by field, to a _CBuilder or _SwiftBuilder."""
        size = len(self._to_binary())
        builder.emit_byte(BINARY_VERSION, "version")
        builder.emit_uleb(size, "remaining record size")
        # Byte length of the encoded name, matching _to_binary.
        builder.emit_uleb(len(self._encoded_type_name()), "type name size")
        builder.emit_string(self.type_name, "type name")
        builder.emit_byte(self.flags, "flags")
        for sig, bc in self.signatures:
            builder.emit_byte(SIGNATURES[sig], f"sig_{sig}")
            builder.emit_uleb(len(bc), "program size")
            builder.emit_bytes(bc, "program")

    @property
    def _var_name(self):
        """Identifier for the emitted variable, derived from the type name."""
        var_name = re.sub(r"\W", "_", self.type_name)
        return f"_{var_name}_formatter"

    def write_c(self, output: TextIO) -> None:
        """Validate and write the record as a C `unsigned char[]` definition."""
        self.validate()

        builder = self._CBuilder()
        self._build(builder)

        print(
            textwrap.dedent(
                """
                #ifdef __APPLE__
                #define FORMATTER_SECTION "__DATA_CONST,__lldbformatters"
                #else
                #define FORMATTER_SECTION ".lldbformatters"
                #endif
                """
            ),
            file=output,
        )
        print(
            "__attribute__((used, section(FORMATTER_SECTION)))",
            file=output,
        )
        print(f"unsigned char {self._var_name}[] =", file=output)
        indent = " "
        for string, comment in builder.entries:
            print(f"{indent}// {comment}", file=output)
            print(f"{indent}{string}", file=output)
        print(";", file=output)

    def write_swift(self, output: TextIO) -> None:
        """Validate and write the record as a Swift tuple-of-UInt8 constant."""
        self.validate()

        builder = self._SwiftBuilder()
        self._build(builder)

        print(
            textwrap.dedent(
                """\
                #if swift(>=6.3)
                #if os(macOS) || os(iOS) || os(watchOS) || os(tvOS) || os(visionOS)
                @section("__DATA_CONST,__lldbformatters")
                #else
                @section(".lldbformatters")
                #endif
                @used"""
            ),
            file=output,
        )
        print(
            f"let {self._var_name}: {builder.type_decl} = (",
            file=output,
        )
        indent = " "
        for bs, comment in builder.entries:
            print(f"{indent}// {comment}", file=output)
            byte_list = ", ".join(f"0x{b:02x}" for b in bs)
            print(f"{indent}{byte_list},", file=output)
        print(")", file=output)
        print("#endif", file=output)  # swift(>=6.3)
|
|
|
|
|
|
def assemble_file(type_name: str, input: TextIO) -> BytecodeSection:
    """Assemble a whole assembly file into a BytecodeSection for `type_name`.

    The text read from `input` is split along its signature labels and each
    body is assembled on its own. Signatures with an empty body are left out.
    The section's flags are always 0.
    """
    segments = _segment_by_signature(_tokenize(input.read()))
    assembled = [(sig, assemble_tokens(body)) for sig, body in segments if body]
    return BytecodeSection(type_name, flags=0, signatures=assembled)
|
|
|
|
|
|
def assemble(assembly: str) -> bytes:
    """Assemble a string of assembly (without signature labels) into bytecode."""
    token_list = _tokenize(assembly)
    return assemble_tokens(token_list)
|
|
|
|
|
|
def assemble_tokens(tokens: list[str]) -> bytes:
    """Assemble a list of assembly tokens into bytecode.

    `tokens` is only read; the caller's list is left untouched.

    Integer literals, block lengths and string lengths are each emitted as a
    single byte (see the FIXMEs), so values above 255 raise ValueError.
    Unknown mnemonics and selectors raise KeyError.
    """
    # This is a stack of all in-flight/unterminated blocks. The bottom entry
    # is the top-level program.
    bytecode = [bytearray()]

    def emit(byte):
        bytecode[-1].append(byte)

    for tok in tokens:
        if tok == "":
            continue
        if tok == "{":
            bytecode.append(bytearray())
        elif tok == "}":
            # Close the innermost block and splice it, length-prefixed, into
            # its parent.
            block = bytecode.pop()
            emit(op_begin)
            emit(len(block))  # FIXME: uleb
            bytecode[-1].extend(block)
        elif tok[0].isdigit():
            if tok[-1] == "u":
                emit(op_lit_uint)
                emit(int(tok[:-1]))  # FIXME
            else:
                emit(op_lit_int)
                emit(int(tok))  # FIXME
        elif tok[0] == "@":
            emit(op_lit_selector)
            emit(selector[tok])
        elif tok[0] == '"':
            # Remove backslash escaping '"' and '\'.
            s = re.sub(r'\\(["\\])', r"\1", tok[1:-1]).encode()
            emit(op_lit_string)
            emit(len(s))
            bytecode[-1].extend(s)
        else:
            emit(opcode[tok])
    assert len(bytecode) == 1, "unterminated {"
    return bytes(bytecode[0])
|
|
|
|
|
|
################################################################################
|
|
# Disassembler.
|
|
################################################################################
|
|
|
|
|
|
def disassemble_file(input: BinaryIO, output: TextIO) -> None:
    """Disassemble a binary formatter record and print it to `output`.

    One line of the form `@<signature>: <assembly>` is printed per signature.
    The type name and flags are parsed but not printed.

    Raises ValueError if the version byte is not BINARY_VERSION.
    """
    stream = io.BytesIO(input.read())

    version = stream.read(1)[0]
    if version != BINARY_VERSION:
        raise ValueError(f"unknown binary version: {version}")

    # Cut the stream off at the end of the record so that the signature loop
    # below ends when the record does.
    record_size = _from_uleb(stream)
    stream.truncate(stream.tell() + record_size)

    name_size = _from_uleb(stream)
    _type_name = stream.read(name_size).decode()
    _flags = stream.read(1)[0]

    while sig_byte := stream.read(1):
        sig_name = SIGNATURE_IDS[sig_byte[0]]
        body_size = _from_uleb(stream)
        asm, _ = disassemble(stream.read(body_size))
        print(f"@{sig_name}: {asm}", file=output)
|
|
|
|
|
|
def disassemble(bytecode: bytes) -> Tuple[str, list[int]]:
    """Disassemble bytecode into (assembly, token starts).

    The second element starts with 0 and then holds, for every byte consumed,
    the length of the assembly text produced at the moment that byte was
    fetched. If a block is still open at the end of the bytecode, "ERROR" is
    appended to the assembly.
    """
    asm = ""
    # Bytes still to be consumed, stored back to front so pop() yields the
    # next one.
    pending = list(reversed(bytecode))
    # Remaining byte count of each open block, innermost last.
    open_blocks = []
    starts = [0]

    def next_byte():
        """Fetch the next byte in the bytecode and keep track of all
        in-flight blocks"""
        open_blocks[:] = [remaining - 1 for remaining in open_blocks]
        starts.append(len(asm))
        return pending.pop()

    while pending:
        op = next_byte()
        if op == op_begin:
            asm += "{"
            open_blocks.append(next_byte())
        elif op == op_lit_uint:
            value = next_byte()  # FIXME uleb
            asm += f"{value}u"
        elif op == op_lit_int:
            value = next_byte()
            asm += str(value)
        elif op == op_lit_selector:
            asm += selector[next_byte()]
        elif op == op_lit_string:
            length = next_byte()
            # The literal is built on the side and appended in one go, so all
            # of its bytes record the same start offset.
            literal = '"'
            for _ in range(length):
                ch = chr(next_byte())
                if ch == '"' or ch == "\\":
                    literal += "\\"
                literal += ch
            literal += '"'
            asm += literal
        else:
            asm += opcode[op]

        # Close every block that has just been fully consumed.
        while open_blocks and open_blocks[-1] == 0:
            asm += " }"
            open_blocks.pop()

        if pending:
            asm += " "

    if open_blocks:
        asm += "ERROR"
    return asm, starts
|
|
|
|
|
|
################################################################################
|
|
# Interpreter.
|
|
################################################################################
|
|
|
|
|
|
def count_fmt_params(fmt: str) -> int:
    """Count the number of positional parameters in a format string.

    This is the number of positional arguments `fmt.format(...)` needs:
    for explicitly numbered fields it is the highest index plus one, for
    auto-numbered fields (`{}`) it is the number of fields. Keyword fields
    (`{name}`) need no positional argument and are not counted.

    Fields nested inside a format spec (`{0:{1}}`) are not inspected.
    """
    from string import Formatter

    count = 0
    auto_numbered = 0
    for _, field_name, _, _ in Formatter().parse(fmt):
        if field_name is None:
            # Literal text with no replacement field.
            continue
        # Drop attribute/index access such as "0.real" or "0[1]".
        head = field_name.split(".", 1)[0].split("[", 1)[0]
        if head == "":
            auto_numbered += 1
            count = max(count, auto_numbered)
        elif head.isdigit():
            count = max(count, int(head) + 1)
    return count
|
|
|
|
|
|
def interpret(bytecode: bytes, control: list, data: list, tracing: bool = False):
    """Interpret bytecode.

    `control` is the control stack (blocks waiting for `if`/`ifelse`) and
    `data` is the data stack; both are modified in place. With `tracing`
    enabled, the state is printed before every instruction.

    Returns the top of the data stack, either when a `return` instruction
    executes or when the bytecode runs out.
    """
    # Stack of (pc, end) pairs, one per block being executed; the bottom
    # entry covers the whole program.
    frame = []
    frame.append((0, len(bytecode)))

    def trace():
        """print a trace of the execution for debugging purposes"""

        def fmt(d):
            if isinstance(d, int):
                return str(d)
            if isinstance(d, str):
                return d
            return repr(type(d))

        pc, end = frame[-1]
        asm, tokens = disassemble(bytecode)
        # Literal opcodes have no mnemonic, so fall back to the raw number
        # rather than failing on the lookup.
        mnemonic = opcode.get(b, f"0x{b:02x}")
        print(
            "=== frame = {1}, data = {2}, opcode = {0}".format(
                mnemonic, frame, [fmt(d) for d in data]
            )
        )
        print(asm)
        print(" " * (tokens[pc]) + "^")

    def next_byte():
        """Fetch the next byte and update the PC"""
        pc, end = frame[-1]
        assert pc < len(bytecode)
        b = bytecode[pc]
        frame[-1] = pc + 1, end
        # At the end of a block?
        while pc >= end:
            frame.pop()
            if not frame:
                return None
            pc, end = frame[-1]
            if pc >= end:
                return None
            b = bytecode[pc]
            frame[-1] = pc + 1, end
        return b

    while frame[-1][0] < len(bytecode):
        b = next_byte()
        if b is None:
            break
        if tracing:
            trace()
        # Data stack manipulation.
        if b == op_dup:
            data.append(data[-1])
        elif b == op_drop:
            data.pop()
        elif b == op_pick:
            # The popped index counts from the bottom of the stack.
            data.append(data[data.pop()])
        elif b == op_over:
            data.append(data[-2])
        elif b == op_swap:
            x = data.pop()
            y = data.pop()
            data.append(x)
            data.append(y)
        elif b == op_rot:
            z = data.pop()
            y = data.pop()
            x = data.pop()
            data.append(z)
            data.append(x)
            data.append(y)

        # Control stack manipulation.
        elif b == op_begin:
            # Remember the block on the control stack and skip over it.
            length = next_byte()
            pc, end = frame[-1]
            control.append((pc, pc + length))
            frame[-1] = pc + length, end
        elif b == op_if:
            if data.pop():
                frame.append(control.pop())
        elif b == op_ifelse:
            # The control stack holds the then-block below the else-block.
            if data.pop():
                control.pop()
                frame.append(control.pop())
            else:
                frame.append(control.pop())
                control.pop()
        elif b == op_return:
            control.clear()
            return data[-1]

        # Literals.
        elif b == op_lit_uint:
            b = next_byte()  # FIXME uleb
            data.append(int(b))
        elif b == op_lit_int:
            b = next_byte()  # FIXME uleb
            data.append(int(b))
        elif b == op_lit_selector:
            b = next_byte()
            data.append(b)
        elif b == op_lit_string:
            length = next_byte()
            s = ""
            while length:
                s += chr(next_byte())
                length -= 1
            data.append(s)

        elif b == op_as_uint:
            pass
        elif b == op_as_int:
            pass
        elif b == op_is_null:
            data.append(1 if data.pop() is None else 0)

        # Arithmetic, logic, etc.
        # For the non-commutative operators the right operand is on top of
        # the stack and therefore popped first.
        elif b == op_plus:
            data.append(data.pop() + data.pop())
        elif b == op_minus:
            data.append(-data.pop() + data.pop())
        elif b == op_mul:
            data.append(data.pop() * data.pop())
        elif b == op_div:
            y = data.pop()
            # NOTE(review): `/` is true division and yields a float — confirm
            # whether integer division is intended.
            data.append(data.pop() / y)
        elif b == op_mod:
            y = data.pop()
            data.append(data.pop() % y)
        elif b == op_shl:
            y = data.pop()
            data.append(data.pop() << y)
        elif b == op_shr:
            y = data.pop()
            data.append(data.pop() >> y)
        elif b == op_and:
            data.append(data.pop() & data.pop())
        elif b == op_or:
            data.append(data.pop() | data.pop())
        elif b == op_xor:
            data.append(data.pop() ^ data.pop())
        elif b == op_not:
            # NOTE(review): this is a logical not although the mnemonic is
            # "~" — confirm against the specification.
            data.append(not data.pop())
        elif b == op_eq:
            data.append(data.pop() == data.pop())
        elif b == op_neq:
            data.append(data.pop() != data.pop())
        # The comparisons below look inverted because the right operand is
        # popped first.
        elif b == op_lt:
            data.append(data.pop() > data.pop())
        elif b == op_gt:
            data.append(data.pop() < data.pop())
        elif b == op_le:
            data.append(data.pop() >= data.pop())
        elif b == op_ge:
            data.append(data.pop() <= data.pop())

        # Function calls.
        elif b == op_call:
            sel = data.pop()
            if sel == sel_summary:
                data.append(data.pop().GetSummary())
            elif sel == sel_get_num_children:
                data.append(data.pop().GetNumChildren())
            elif sel == sel_get_child_at_index:
                index = data.pop()
                valobj = data.pop()
                data.append(valobj.GetChildAtIndex(index))
            elif sel == sel_get_child_with_name:
                name = data.pop()
                valobj = data.pop()
                data.append(valobj.GetChildMemberWithName(name))
            elif sel == sel_get_child_index:
                name = data.pop()
                valobj = data.pop()
                data.append(valobj.GetIndexOfChildWithName(name))
            elif sel == sel_get_type:
                data.append(data.pop().GetType())
            elif sel == sel_get_template_argument_type:
                n = data.pop()
                valobj = data.pop()
                data.append(valobj.GetTemplateArgumentType(n))
            elif sel == sel_get_synthetic_value:
                data.append(data.pop().GetSyntheticValue())
            elif sel == sel_get_non_synthetic_value:
                data.append(data.pop().GetNonSyntheticValue())
            elif sel == sel_get_value:
                data.append(data.pop().GetValue())
            elif sel == sel_get_value_as_unsigned:
                data.append(data.pop().GetValueAsUnsigned())
            elif sel == sel_get_value_as_signed:
                data.append(data.pop().GetValueAsSigned())
            elif sel == sel_get_value_as_address:
                data.append(data.pop().GetValueAsAddress())
            elif sel == sel_cast:
                sbtype = data.pop()
                valobj = data.pop()
                data.append(valobj.Cast(sbtype))
            elif sel == sel_strlen:
                s = data.pop()
                data.append(len(s) if s else 0)
            elif sel == sel_fmt:
                fmt = data.pop()
                n = count_fmt_params(fmt)
                args = []
                for i in range(n):
                    args.append(data.pop())
                data.append(fmt.format(*args))
            else:
                print("not implemented: " + selector[sel])
                assert False
    return data[-1]
|
|
|
|
|
|
################################################################################
|
|
# Python -> Bytecode Compiler
|
|
################################################################################
|
|
|
|
# Maps the Python method names that compiled formatters may call to the
# selector emitted for them. Compiler.visit_Call rejects any method that is
# not listed here.
_BUILTINS = {
    "Cast": "@cast",
    "GetChildAtIndex": "@get_child_at_index",
    "GetChildMemberWithName": "@get_child_with_name",
    "GetSummary": "@summary",
    "GetSyntheticValue": "@get_synthetic_value",
    "GetTemplateArgumentType": "@get_template_argument_type",
    "GetType": "@get_type",
    "GetValueAsUnsigned": "@get_value_as_unsigned",
}
|
|
|
|
_COMPS = {
|
|
ast.Eq: "=",
|
|
ast.NotEq: "!=",
|
|
ast.Lt: "<",
|
|
ast.LtE: "=<",
|
|
ast.Gt: ">",
|
|
ast.GtE: "=>",
|
|
}
|
|
|
|
# Maps Python method names in a formatter class to their bytecode signatures.
# Compiler.visit_ClassDef rejects classes that define any other method. Note
# that the Python method `num_children` maps to `@get_num_children`.
_METHOD_SIGS = {
    "__init__": "@init",
    "update": "@update",
    "num_children": "@get_num_children",
    "get_child_index": "@get_child_index",
    "get_child_at_index": "@get_child_at_index",
    "get_value": "@get_value",
}
|
|
|
|
|
|
class CompilerError(Exception):
    """Raised when the compiler meets Python source it cannot compile.

    `lineno` is the source line of the AST node that caused the error.
    """

    lineno: int

    def __init__(self, message, node: Union[ast.expr, ast.stmt]) -> None:
        Exception.__init__(self, message)
        self.lineno = node.lineno
|
|
|
|
|
|
class Compiler(ast.NodeVisitor):
|
|
"""
|
|
Compile Python LLDB data formatters to LLDB formatter bytecode.
|
|
|
|
This compiler is supports a limited subset of Python.
|
|
|
|
# Supported Features
|
|
|
|
* Top level functions implementing LLDB summary formatters
|
|
* Top level classes implementing LLDB synthetic formatters
|
|
* Partial support for the following, see below for more details:
|
|
- Object attributes (properties)
|
|
- Local variables
|
|
- Function calls
|
|
* Python language support
|
|
[x] If statements (including else, elif and nested if)
|
|
[x] Return statements
|
|
[x] String, integer, float, boolean, and None literals
|
|
[x] Binary comparisons
|
|
[ ] Boolean operators
|
|
[ ] Math operations
|
|
|
|
# Unsupported Features
|
|
|
|
Note: that this is not exhaustive, refer to the list of supported
|
|
features above.
|
|
|
|
* For and while loops
|
|
* Exceptions
|
|
* User defined general purpose functions and classes
|
|
* Lists, dicts, sets, and other container data types
|
|
* Iterators, comprehensions, yield, etc
|
|
* With statements
|
|
* Imports of any modules
|
|
|
|
# Variables
|
|
|
|
The compiler supports two kinds of variables, local variables and attribute
|
|
variables (properties), but there are limitations on both.
|
|
|
|
In __init__ and update, local variables are currently *not* supported, but
|
|
attributes can be assigned to. This matches the common case for these
|
|
functions.
|
|
|
|
In all other function bodies, local variables _are_ supported, but
|
|
attributes can only be read from, *not* assigned to. This also matches the
|
|
common case for these functions.
|
|
|
|
Variables (local and attributes) are tracked, allowing the compiler to know
|
|
their position in the stack. Variable reads can then be lowered to `pick`
|
|
instructions. See the compiler's `locals` and `attrs` attributes.
|
|
|
|
# Functions
|
|
|
|
Known functions are supported, a design that customizes the scope of what
|
|
formatters can and can't do. The functions known to the compiler are called
|
|
"selectors". The selectors are primarily SBValue API, although there are
|
|
also general purpose selectors. Formatters can only call selectors, not user
|
|
defined functions, and not SB methods that have not been defined as a
|
|
selector.
|
|
"""
|
|
|
|
# Names of locals in bottom-to-top stack order. locals[0] is the
|
|
# oldest/deepest; locals[-1] is the most recently pushed.
|
|
locals: list[str]
|
|
|
|
# Names of visible attrs in bottom-to-top stack order. Always holds the
|
|
# full combined frame for the method being compiled: grows incrementally
|
|
# during __init__/update, and is set to the combined list before getter
|
|
# methods are compiled.
|
|
attrs: list[str]
|
|
|
|
# Bytecode signature of the method being compiled, or None for top-level
|
|
# functions.
|
|
current_sig: Optional[str]
|
|
|
|
buffer: io.StringIO
|
|
|
|
def __init__(self) -> None:
|
|
self.locals = []
|
|
self.attrs = []
|
|
self.current_sig = None
|
|
self.buffer = io.StringIO()
|
|
|
|
def compile(self, source_file: str) -> str:
|
|
with open(source_file) as f:
|
|
root = ast.parse(f.read())
|
|
self.visit(root)
|
|
return self.buffer.getvalue()
|
|
|
|
def visit_ClassDef(self, node: ast.ClassDef) -> None:
|
|
# Compile methods in a fixed order so that attrs is fully populated
|
|
# before getter methods are compiled.
|
|
methods = {}
|
|
for item in node.body:
|
|
if isinstance(item, ast.FunctionDef):
|
|
if item.name not in _METHOD_SIGS:
|
|
raise CompilerError(f"unsupported method: {item.name}", item)
|
|
methods[item.name] = item
|
|
|
|
self.attrs = []
|
|
if method := methods.get("__init__"):
|
|
self._compile_method(method)
|
|
# self.attrs now holds init's attrs. update's attrs are appended above
|
|
# them, so after update self.attrs is the combined init+update list.
|
|
if method := methods.get("update"):
|
|
self._compile_method(method)
|
|
|
|
for method_name, method in methods.items():
|
|
if method_name not in ("__init__", "update"):
|
|
self._compile_method(method)
|
|
|
|
def _compile_method(self, node: ast.FunctionDef) -> None:
|
|
self.current_sig = _METHOD_SIGS[node.name]
|
|
|
|
return_type = node.returns.id if isinstance(node.returns, ast.Name) else None
|
|
if node.name == "update" and return_type != "bool":
|
|
raise CompilerError(
|
|
"update must be declared to return bool: def update(self) -> bool:",
|
|
node,
|
|
)
|
|
|
|
# Strip 'self' (and 'internal_dict' for __init__) from the arg list;
|
|
# the remaining args become the initial locals.
|
|
args = copy(node.args.args)
|
|
args.pop(0) # drop 'self'
|
|
if node.name == "__init__":
|
|
args.pop() # drop trailing 'internal_dict'
|
|
|
|
self.locals = [arg.arg for arg in args]
|
|
|
|
# Compile into a temporary buffer so the signature line can be
|
|
# emitted first.
|
|
saved_buffer = self.buffer
|
|
self.buffer = io.StringIO()
|
|
|
|
self._visit_each(node.body)
|
|
|
|
method_output = self.buffer.getvalue()
|
|
self.buffer = saved_buffer
|
|
self._output(f"{self.current_sig}:")
|
|
self._output(method_output)
|
|
|
|
self.locals.clear()
|
|
self.current_sig = None
|
|
|
|
def visit_FunctionDef(self, node: ast.FunctionDef) -> None:
|
|
# Top-level function (not inside a class).
|
|
self.current_sig = None
|
|
self.attrs = []
|
|
self.locals = [arg.arg for arg in node.args.args]
|
|
self._visit_each(node.body)
|
|
self.locals.clear()
|
|
|
|
def visit_Compare(self, node: ast.Compare) -> None:
|
|
self.visit(node.left)
|
|
# XXX: Does not handle multiple comparisons, ex: `0 < x < 10`
|
|
self.visit(node.comparators[0])
|
|
self._output(_COMPS[type(node.ops[0])])
|
|
|
|
def visit_If(self, node: ast.If) -> None:
|
|
self.visit(node.test)
|
|
|
|
self._output("{")
|
|
self._visit_each(node.body)
|
|
if node.orelse:
|
|
self._output("} {")
|
|
self._visit_each(node.orelse)
|
|
self._output("} ifelse")
|
|
else:
|
|
self._output("} if")
|
|
|
|
def visit_Return(self, node: ast.Return) -> None:
|
|
if node.value:
|
|
self.visit(node.value)
|
|
self._output("return")
|
|
|
|
def visit_Constant(self, node: ast.Constant) -> None:
|
|
if isinstance(node.value, str):
|
|
self._output(f'"{node.value}"')
|
|
elif isinstance(node.value, bool):
|
|
self._output(int(node.value))
|
|
else:
|
|
self._output(node.value)
|
|
|
|
def visit_Call(self, node: ast.Call) -> None:
|
|
func = node.func
|
|
if isinstance(func, ast.Attribute):
|
|
receiver = func.value
|
|
method = func.attr
|
|
# self is not a valid call receiver.
|
|
if isinstance(receiver, ast.Name) and receiver.id == "self":
|
|
raise CompilerError(
|
|
"self is not a valid call receiver; use self.attr to read an attribute",
|
|
node,
|
|
)
|
|
if selector := _BUILTINS.get(method):
|
|
self.visit(receiver)
|
|
self._visit_each(node.args)
|
|
self._output(f"{selector} call")
|
|
return
|
|
raise CompilerError(f"unsupported method: {method}", node)
|
|
|
|
if isinstance(func, ast.Name):
|
|
raise CompilerError(f"unsupported function: {func.id}", node)
|
|
|
|
raise CompilerError("unsupported function call expression", node)
|
|
|
|
def visit_Assign(self, node: ast.Assign) -> None:
|
|
target = node.targets[0]
|
|
|
|
# Handle self.attr = expr (attribute assignment).
|
|
if (
|
|
isinstance(target, ast.Attribute)
|
|
and isinstance(target.value, ast.Name)
|
|
and target.value.id == "self"
|
|
):
|
|
if self.current_sig not in ("@init", "@update"):
|
|
raise CompilerError(
|
|
"attribute assignment is only allowed in __init__ and update",
|
|
node,
|
|
)
|
|
|
|
attr = target.attr
|
|
if attr in self.attrs:
|
|
raise CompilerError(f"attribute '{attr}' is already assigned", node)
|
|
|
|
# If the RHS is an argument (the only kind of local permitted in
|
|
# __init__) - then it is already on the stack in place, and no
|
|
# evaluation is needed.
|
|
is_arg = (
|
|
isinstance(node.value, ast.Name)
|
|
and self._local_index(node.value) is not None
|
|
)
|
|
if not is_arg:
|
|
# Evaluate the RHS, leaving its value on the stack.
|
|
self.visit(node.value)
|
|
|
|
# Record the attr.
|
|
self.attrs.append(attr)
|
|
return
|
|
|
|
# Handle local variable assignment.
|
|
if self.current_sig in ("@init", "@update"):
|
|
raise CompilerError(
|
|
"local variable assignment is not allowed in __init__ or update; "
|
|
"use attribute assignment (self.attr = ...) instead",
|
|
node,
|
|
)
|
|
|
|
if isinstance(target, ast.Name):
|
|
names = [target]
|
|
elif isinstance(target, ast.Tuple):
|
|
names = cast(list[ast.Name], target.elts)
|
|
else:
|
|
raise CompilerError("unsupported assignment target", node)
|
|
|
|
# Visit RHS, leaving its value on the stack.
|
|
self.visit(node.value)
|
|
|
|
# Forget any previous bindings of these names.
|
|
# Their values are orphaned on the stack.
|
|
for name in names:
|
|
idx = self._local_index(name)
|
|
if idx is not None:
|
|
self.locals[idx] = ""
|
|
|
|
self.locals.extend(x.id for x in names)
|
|
|
|
def visit_Attribute(self, node: ast.Attribute) -> None:
|
|
# Only self.attr reads are supported here.
|
|
if not (isinstance(node.value, ast.Name) and node.value.id == "self"):
|
|
raise CompilerError(
|
|
"unsupported attribute access (only self.attr is supported)", node
|
|
)
|
|
pick_idx = self._attr_index(node.attr, node)
|
|
self._output(f"{pick_idx}u pick") # "# self.{node.attr}"
|
|
|
|
def visit_Name(self, node: ast.Name) -> None:
|
|
idx = self._local_index(node)
|
|
if idx is None:
|
|
raise CompilerError(f"unknown local variable: {node.id}", node)
|
|
self._output(f"{idx}u pick") # "# {node.id}"
|
|
|
|
def _visit_each(self, nodes: Sequence[ast.AST]) -> None:
|
|
for child in nodes:
|
|
self.visit(child)
|
|
|
|
def _attr_index(self, name: str, node: ast.expr) -> int:
|
|
# self.attrs is always the full visible attr frame, so the index is
|
|
# the direct pick offset with no further adjustment.
|
|
try:
|
|
return self.attrs.index(name)
|
|
except ValueError:
|
|
raise CompilerError(f"unknown attribute: {name}", node)
|
|
|
|
def _local_index(self, name: ast.Name) -> Optional[int]:
|
|
try:
|
|
idx = self.locals.index(name.id)
|
|
# Offset past all attrs.
|
|
return len(self.attrs) + idx
|
|
except ValueError:
|
|
return None
|
|
|
|
def _output(self, x: Any) -> None:
    """Print `x`, followed by a newline, to `self.buffer`."""
    sink = self.buffer
    print(x, file=sink)
|
|
|
|
|
|
################################################################################
|
|
# Helper functions.
|
|
################################################################################
|
|
|
|
|
|
def _to_uleb(value: int) -> bytes:
    """Return the ULEB128 encoding of a non-negative integer.

    The value is emitted seven bits at a time, least-significant group
    first; every byte except the last has its high bit (0x80) set.

    Raises:
        ValueError: if `value` is negative.
    """
    if value < 0:
        raise ValueError(f"negative number cannot be encoded to ULEB128: {value}")

    encoded = bytearray()
    # Emit continuation bytes while more than seven bits remain.
    while value > 0x7F:
        encoded.append((value & 0x7F) | 0x80)
        value >>= 7
    # The remainder fits in seven bits and terminates the sequence.
    encoded.append(value)
    return bytes(encoded)
|
|
|
|
|
|
def _from_uleb(stream: BinaryIO) -> int:
    """Read one ULEB128-encoded integer from `stream` and return it.

    Bytes are consumed one at a time; the low seven bits of each byte are
    placed at successively higher positions of the result, and the first
    byte whose high bit (0x80) is clear ends the number. If `read(1)`
    yields an empty bytes object, indexing it raises IndexError.
    """
    value = 0
    bit_offset = 0
    more = True
    while more:
        octet = stream.read(1)[0]
        value |= (octet & 0x7F) << bit_offset
        bit_offset += 7
        # The high bit marks a continuation byte.
        more = bool(octet & 0x80)
    return value
|
|
|
|
|
|
def _to_byte(n: int) -> bytes:
    """Return `n` encoded as a single big-endian byte."""
    return n.to_bytes(length=1, byteorder="big")
|
|
|
|
|
|
def _write_section_to_path(section, path, fmt, *, invocation_comment=False):
    """Write an assembled section to `path` as binary or as C/Swift source.

    For the "binary" format the file is opened in binary mode and
    `section.write_binary` is used. For any other format the file is opened
    in text mode and `section.write_source(..., language=fmt)` is used,
    optionally preceded by a comment recording the command line.
    """
    if fmt == "binary":
        with open(path, "wb") as output:
            section.write_binary(output)
        return

    with open(path, "w") as output:
        if invocation_comment:
            print("// Generated with:", file=output)
            print("// ", shlex.join(sys.argv), file=output)
        section.write_source(output, language=fmt)


def _main():
    """Command-line entry point: compile, assemble, or disassemble.

    Exactly one of --compile, --assemble, or --disassemble selects the mode:

    * --compile: compile the Python formatter in `input` to assembly,
      assemble it, and write the result to --output.
    * --assemble: assemble the assembly text in `input` and write the result
      to --output.
    * --disassemble: disassemble the binary in `input` to --output, or to
      stdout when --output is not given.

    Usage errors (missing mode, missing --type-name or --output) are reported
    through `parser.error`, which exits with status 2. A CompilerError is
    printed to stderr as `file:line: message` and the process exits with
    status 1 so that callers such as build systems can detect the failure.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="""
        Assembler, disassembler, and interpreter for LLDB dataformatter bytecode.
        See https://lldb.llvm.org/resources/formatterbytecode.html for more details.
        """
    )
    parser.add_argument("input", help="input file")
    mode = parser.add_mutually_exclusive_group()
    mode.add_argument(
        "-c",
        "--compile",
        action="store_true",
        help="compile a Python LLDB data formatter into LLDB formatter bytecode",
    )
    mode.add_argument(
        "-a",
        "--assemble",
        action="store_true",
        help="assemble assembly into bytecode",
    )
    mode.add_argument(
        "-d",
        "--disassemble",
        action="store_true",
        help="disassemble bytecode",
    )
    parser.add_argument("-n", "--type-name", help="source type of formatter")
    parser.add_argument(
        "--skip-invocation-comment",
        action="store_true",
        help="do not print invocation comment in compiled output",
    )
    parser.add_argument(
        "-o",
        "--output",
        help="output file (required for --compile and --assemble)",
    )
    parser.add_argument(
        "-f",
        "--format",
        choices=("binary", "c", "swift"),
        default="binary",
        help="output file format",
    )
    parser.add_argument("-t", "--test", action="store_true", help="run unit tests")

    args = parser.parse_args()
    if args.compile:
        if not args.type_name:
            parser.error("--type-name is required with --compile")
        if not args.output:
            parser.error("--output is required with --compile")
        compiler = Compiler()
        try:
            assembly = compiler.compile(args.input)
        except CompilerError as e:
            print(f"{args.input}:{e.lineno}: {e}", file=sys.stderr)
            # A failed compile must not look like success to the caller.
            sys.exit(1)

        section = assemble_file(args.type_name, io.StringIO(assembly))
        _write_section_to_path(
            section,
            args.output,
            args.format,
            invocation_comment=not args.skip_invocation_comment,
        )
    elif args.assemble:
        if not args.type_name:
            parser.error("--type-name is required with --assemble")
        if not args.output:
            parser.error("--output is required with --assemble")
        with open(args.input) as infile:
            section = assemble_file(args.type_name, infile)
        _write_section_to_path(section, args.output, args.format)
    elif args.disassemble:
        if args.output:
            # Open the input first so a missing input does not create or
            # truncate the output file.
            with open(args.input, "rb") as infile, open(args.output, "w") as outfile:
                disassemble_file(infile, outfile)
        else:
            with open(args.input, "rb") as infile:
                disassemble_file(infile, sys.stdout)
    else:
        # Without a mode there is nothing to do; say so instead of silently
        # exiting successfully.
        parser.error("one of --compile, --assemble, or --disassemble is required")
|
|
|
|
|
|
if __name__ == "__main__":
    # Without -t/--test this is a normal tool invocation: run it and stop
    # before the unit tests below are defined.
    if "-t" not in sys.argv and "--test" not in sys.argv:
        _main()
        sys.exit()

    ############################################################################
    # Tests.
    ############################################################################
    import unittest

    class TestAssembler(unittest.TestCase):
        # Comments are used instead of method docstrings so that unittest's
        # verbose test descriptions stay as they were.

        def test_assemble(self):
            # Assembly text -> expected bytecode, as hex.
            encodings = (
                ("1u dup", "200101"),
                ('"1u dup"', "2206317520647570"),
                ("16 < { dup } if", "21105210010111"),
                ('{ { " } " } }', "100710052203207d20"),
            )
            for text, expected_hex in encodings:
                self.assertEqual(assemble(text).hex(), expected_hex)

            def check_roundtrip(text):
                self.assertEqual(disassemble(assemble(text))[0], text)

            for text in ("1u dup", "16 < { dup } if", '{ { " } " } }'):
                check_roundtrip(text)

            # String specific checks.
            for text in ('1u "2u 3u"', '"a b"', '"a \\" b"'):
                check_roundtrip(text)

            # Program -> value produced by interpreting it.
            evaluations = (
                ("1 1 +", 2),
                ("2 1 1 + *", 4),
                ('2 1 > { "yes" } { "no" } ifelse', "yes"),
            )
            for program, expected_value in evaluations:
                self.assertEqual(interpret(assemble(program), [], []), expected_value)

        def test_assemble_file(self):
            def to_binary(type_name, text):
                buf = io.BytesIO()
                assemble_file(type_name, io.StringIO(text)).write_binary(buf)
                buf.seek(0)
                return buf

            def to_text(binary):
                buf = io.StringIO()
                disassemble_file(binary, buf)
                buf.seek(0)
                return buf

            # assemble -> disassemble -> assemble round-trip: binary is identical.
            source = "@summary: dup @get_value_as_unsigned call return\n@get_num_children: drop 5u return"
            first_binary = to_binary("MyType", source)
            first_text = to_text(first_binary)
            second_binary = to_binary("MyType", first_text.read())
            self.assertEqual(first_binary.getvalue(), second_binary.getvalue())

            # disassemble -> assemble -> disassemble round-trip: text is identical.
            second_text = to_text(second_binary)
            self.assertEqual(first_text.getvalue(), second_text.getvalue())

            # disassemble output contains expected signatures.
            for signature in ("@summary:", "@get_num_children:"):
                self.assertIn(signature, first_text.getvalue())

            # Duplicate signature is an error.
            with self.assertRaises(ValueError):
                to_binary("MyType", "@summary: 1u return\n@summary: 2u return")

        def test_write_source(self):
            # Use the Account example from main.cpp as a reference, whose
            # exact byte values are known.
            section = BytecodeSection(
                type_name="Account",
                flags=0,
                signatures=[
                    ("get_num_children", bytes.fromhex("2001")),
                    ("get_child_at_index", bytes.fromhex("022000231160")),
                ],
            )
            rendered = io.StringIO()
            section.write_source(rendered, language="c")
            src = rendered.getvalue()

            expected_fragments = (
                "__attribute__((used, section(FORMATTER_SECTION)))",
                "unsigned char _Account_formatter[] =",
                '"\\x01"',  # version
                '"\\x15"',  # record size (21)
                '"\\x07"',  # type name size (7)
                '"Account"',  # type name
                '"\\x00"',  # flags
                '"\\x02"',  # sig_get_num_children
                '"\\x20\\x01"',  # program
                '"\\x04"',  # sig_get_child_at_index
                '"\\x06"',  # program size
                '"\\x02\\x20\\x00\\x23\\x11\\x60"',  # program
                "// version",
                "// type name",
                "// program",
            )
            for fragment in expected_fragments:
                self.assertIn(fragment, src)
            # Semicolon terminates the array initializer.
            self.assertEqual(src.count(";"), 1)

            # Non-identifier characters in the type name are replaced with '_'.
            templated = io.StringIO()
            BytecodeSection("std::vector<int>", 0, []).write_source(
                templated, language="c"
            )
            self.assertIn("_std__vector_int__formatter[] =", templated.getvalue())

    unittest.main(argv=[__file__])
|