llvm-project/llvm/utils/llvm-original-di-preservation.py
Stephen Tozer bc216b057d
[Debugify] Improve reduction of debugify coverage build output (#150212)
In current DebugLoc coverage builds, the output for any reasonably large
build can become very large if any missing DebugLocs are present; this
happens because single errors in LLVM may result in many errors being
reported in the output report. The main cause of this is that the empty
locations attached to instructions may be propagated to other
instructions in later passes, which will each be reported as new errors.
This patch prevents this by adding an "unknown" annotation to
instructions after reporting them once, ensuring that any other
DebugLocs copied or derived from the original empty location will not be
marked as new errors.

As a separate but related change, this patch updates the report
generation script to deduplicate results using the recorded stacktrace
if they are available, instead of the pass+instruction combination. This
reduces the size of the reduction, but makes the reduction highly
reliable, as the stacktrace allows us to very precisely identify when
two bugs have originated from the same place.
2025-08-15 14:01:04 +01:00

712 lines
23 KiB
Python
Executable File

#!/usr/bin/env python
#
# Debugify summary for the original debug info testing.
#
from __future__ import print_function
import argparse
import os
import re
import sys
from json import loads
from collections import defaultdict
from collections import OrderedDict
class DILocBug:
    """Record of a single DILocation (source line location) loss reported
    by debugify for one instruction."""

    def __init__(self, origin, action, bb_name, fn_name, instr):
        self.origin = origin
        self.action = action
        self.bb_name = bb_name
        self.fn_name = fn_name
        self.instr = instr

    def key(self):
        """Uniqueness key for deduplication within a single file/pass pair."""
        return "".join((self.action, self.bb_name, self.fn_name, self.instr))

    def reduced_key(self, bug_pass):
        """Key used for deduplication within and across files in --reduce mode."""
        if self.origin is not None:
            # If we have the origin stacktrace available, we can use it to efficiently deduplicate identical errors. We
            # just need to remove the pointer values from the string first, so that we can deduplicate across files.
            return re.sub(r"0x[0-9a-fA-F]+", "", self.origin)
        return bug_pass + self.instr

    def to_dict(self):
        """Return a plain-dict view of this bug for YAML-style printing; the
        optional origin stacktrace is included only when present."""
        result = {
            "instr": self.instr,
            "fn_name": self.fn_name,
            "bb_name": self.bb_name,
            "action": self.action,
        }
        if self.origin:
            result["origin"] = self.origin
        return result
class DISPBug:
    """Record of a DISubprogram (function debug info) loss reported by debugify."""

    def __init__(self, action, fn_name):
        self.action = action
        self.fn_name = fn_name

    def key(self):
        """Uniqueness key for deduplication within a single file/pass pair."""
        return f"{self.action}{self.fn_name}"

    def reduced_key(self, bug_pass):
        """Key used for deduplication within and across files in --reduce mode."""
        return f"{bug_pass}{self.fn_name}"

    def to_dict(self):
        """Return a plain-dict view of this bug for YAML-style printing."""
        return {
            "fn_name": self.fn_name,
            "action": self.action,
        }
class DIVarBug:
    """Record of a debug-variable location loss reported by debugify."""

    def __init__(self, action, name, fn_name):
        self.action = action
        self.name = name
        self.fn_name = fn_name

    def key(self):
        """Uniqueness key for deduplication within a single file/pass pair."""
        return f"{self.action}{self.name}{self.fn_name}"

    def reduced_key(self, bug_pass):
        """Key used for deduplication within and across files in --reduce mode."""
        return f"{bug_pass}{self.name}"

    def to_dict(self):
        """Return a plain-dict view of this bug for YAML-style printing."""
        return {
            "fn_name": self.fn_name,
            "name": self.name,
            "action": self.action,
        }
def print_bugs_yaml(name, bugs_dict, indent=2):
    """Print `bugs_dict` to stdout in a YAML-like layout.

    `bugs_dict` maps file name -> pass name -> list of bug objects exposing
    `to_dict()`.  Files, passes, and each bug's attributes are printed in
    sorted order; multiline attribute values (e.g. stacktraces) are emitted
    as YAML block scalars.
    """

    def line_for(level, text, margin_mark=False):
        # The first attribute of each bug carries a '-' list marker in the
        # margin; all other lines are plain indentation.
        if margin_mark:
            return "- ".rjust(level * indent) + text
        return " " * (indent * level) + text

    print(f"{name}:")
    for bugs_file, per_pass in sorted(bugs_dict.items()):
        print(line_for(1, f"{bugs_file}:"))
        for bugs_pass, bug_list in sorted(per_pass.items()):
            print(line_for(2, f"{bugs_pass}:"))
            for bug in bug_list:
                is_first = True
                for attr, val in sorted(bug.to_dict().items()):
                    if "\n" in val:
                        # Multiline strings become YAML block text.
                        print(line_for(3, f"{attr}: |", is_first))
                        for text_line in val.splitlines():
                            print(line_for(4, text_line))
                    else:
                        print(line_for(3, f"{attr}: {val}", is_first))
                    is_first = False
# Report the bugs in form of html.
def generate_html_report(
    di_location_bugs,
    di_subprogram_bugs,
    di_var_bugs,
    di_location_bugs_summary,
    di_sp_bugs_summary,
    di_var_bugs_summary,
    html_file,
):
    """Write an HTML report of all collected bugs to `html_file`.

    Each bug dict maps file name -> pass name -> list of bug objects
    (DILocBug / DISPBug / DIVarBug); each summary dict maps pass name ->
    number of bugs.  For every bug category a detail table and a per-pass
    summary table are emitted.
    """
    fileout = open(html_file, "w")

    html_header = """ <html>
<head>
<style>
table, th, td {
border: 1px solid black;
}
table.center {
margin-left: auto;
margin-right: auto;
}
</style>
</head>
<body>
"""

    # Create the table for Location bugs.
    table_title_di_loc = "Location Bugs found by the Debugify"
    table_di_loc = """<table>
<caption><b>{}</b></caption>
<tr>
""".format(
        table_title_di_loc
    )

    # If any DILocation bug has an origin stack trace, we emit an extra column in the table, which we must therefore
    # determine up-front.
    has_origin_col = any(
        x.origin is not None
        for per_file_bugs in di_location_bugs.values()
        for per_pass_bugs in per_file_bugs.values()
        for x in per_pass_bugs
    )

    header_di_loc = [
        "File",
        "LLVM Pass Name",
        "LLVM IR Instruction",
        "Function Name",
        "Basic Block Name",
        "Action",
    ]
    if has_origin_col:
        header_di_loc.append("Origin")

    for column in header_di_loc:
        table_di_loc += " <th>{0}</th>\n".format(column.strip())
    table_di_loc += " </tr>\n"

    at_least_one_bug_found = False

    # Handle location bugs.
    for file, per_file_bugs in di_location_bugs.items():
        for llvm_pass, per_pass_bugs in per_file_bugs.items():
            # No location bugs for the pass.
            if len(per_pass_bugs) == 0:
                continue
            at_least_one_bug_found = True

            row = []
            table_di_loc += " </tr>\n"

            # Get the bugs info.
            for x in per_pass_bugs:
                row.append(" <tr>\n")
                row.append(file)
                row.append(llvm_pass)
                row.append(x.instr)
                row.append(x.fn_name)
                row.append(x.bb_name)
                row.append(x.action)
                if has_origin_col:
                    if x.origin is not None:
                        # Hide the potentially long stacktrace behind a
                        # collapsible <details> element.
                        row.append(
                            f"<details><summary>View Origin StackTrace</summary><pre>{x.origin}</pre></details>"
                        )
                    else:
                        row.append("")
                row.append(" </tr>\n")

            # Dump the bugs info into the table.
            for column in row:
                # The same file-pass pair can have multiple bugs.
                if column == " <tr>\n" or column == " </tr>\n":
                    table_di_loc += column
                    continue
                table_di_loc += " <td>{0}</td>\n".format(column.strip())
            # NOTE(review): this opening <tr> is never closed; browsers
            # tolerate it but it looks unintentional -- confirm.
            table_di_loc += " <tr>\n"

    if not at_least_one_bug_found:
        table_di_loc += """ <tr>
<td colspan='7'> No bugs found </td>
</tr>
"""
    table_di_loc += "</table>\n"

    # Create the summary table for the loc bugs.
    table_title_di_loc_sum = "Summary of Location Bugs"
    table_di_loc_sum = """<table>
<caption><b>{}</b></caption>
<tr>
""".format(
        table_title_di_loc_sum
    )

    header_di_loc_sum = ["LLVM Pass Name", "Number of bugs"]

    for column in header_di_loc_sum:
        table_di_loc_sum += " <th>{0}</th>\n".format(column.strip())
    table_di_loc_sum += " </tr>\n"

    # Print the summary.
    row = []
    for llvm_pass, num in sorted(di_location_bugs_summary.items()):
        row.append(" <tr>\n")
        row.append(llvm_pass)
        row.append(str(num))
        row.append(" </tr>\n")
    for column in row:
        if column == " <tr>\n" or column == " </tr>\n":
            table_di_loc_sum += column
            continue
        table_di_loc_sum += " <td>{0}</td>\n".format(column.strip())
    table_di_loc_sum += " <tr>\n"

    # `at_least_one_bug_found` still reflects the location bugs here; it is
    # only reset below, before the SP section.
    if not at_least_one_bug_found:
        table_di_loc_sum += """<tr>
<td colspan='2'> No bugs found </td>
</tr>
"""
    table_di_loc_sum += "</table>\n"

    # Create the table for SP bugs.
    table_title_di_sp = "SP Bugs found by the Debugify"
    table_di_sp = """<table>
<caption><b>{}</b></caption>
<tr>
""".format(
        table_title_di_sp
    )

    header_di_sp = ["File", "LLVM Pass Name", "Function Name", "Action"]

    for column in header_di_sp:
        table_di_sp += " <th>{0}</th>\n".format(column.strip())
    table_di_sp += " </tr>\n"

    at_least_one_bug_found = False

    # Handle fn bugs.
    for file, per_file_bugs in di_subprogram_bugs.items():
        for llvm_pass, per_pass_bugs in per_file_bugs.items():
            # No SP bugs for the pass.
            if len(per_pass_bugs) == 0:
                continue
            at_least_one_bug_found = True

            row = []
            table_di_sp += " </tr>\n"

            # Get the bugs info.
            for x in per_pass_bugs:
                row.append(" <tr>\n")
                row.append(file)
                row.append(llvm_pass)
                row.append(x.fn_name)
                row.append(x.action)
                row.append(" </tr>\n")

            # Dump the bugs info into the table.
            for column in row:
                # The same file-pass pair can have multiple bugs.
                if column == " <tr>\n" or column == " </tr>\n":
                    table_di_sp += column
                    continue
                table_di_sp += " <td>{0}</td>\n".format(column.strip())
            table_di_sp += " <tr>\n"

    if not at_least_one_bug_found:
        table_di_sp += """<tr>
<td colspan='4'> No bugs found </td>
</tr>
"""
    table_di_sp += "</table>\n"

    # Create the summary table for the sp bugs.
    table_title_di_sp_sum = "Summary of SP Bugs"
    table_di_sp_sum = """<table>
<caption><b>{}</b></caption>
<tr>
""".format(
        table_title_di_sp_sum
    )

    header_di_sp_sum = ["LLVM Pass Name", "Number of bugs"]

    for column in header_di_sp_sum:
        table_di_sp_sum += " <th>{0}</th>\n".format(column.strip())
    table_di_sp_sum += " </tr>\n"

    # Print the summary.
    row = []
    for llvm_pass, num in sorted(di_sp_bugs_summary.items()):
        row.append(" <tr>\n")
        row.append(llvm_pass)
        row.append(str(num))
        row.append(" </tr>\n")
    for column in row:
        if column == " <tr>\n" or column == " </tr>\n":
            table_di_sp_sum += column
            continue
        table_di_sp_sum += " <td>{0}</td>\n".format(column.strip())
    table_di_sp_sum += " <tr>\n"

    if not at_least_one_bug_found:
        table_di_sp_sum += """<tr>
<td colspan='2'> No bugs found </td>
</tr>
"""
    table_di_sp_sum += "</table>\n"

    # Create the table for Variable bugs.
    table_title_di_var = "Variable Location Bugs found by the Debugify"
    table_di_var = """<table>
<caption><b>{}</b></caption>
<tr>
""".format(
        table_title_di_var
    )

    header_di_var = ["File", "LLVM Pass Name", "Variable", "Function", "Action"]

    for column in header_di_var:
        table_di_var += " <th>{0}</th>\n".format(column.strip())
    table_di_var += " </tr>\n"

    at_least_one_bug_found = False

    # Handle var bugs.
    for file, per_file_bugs in di_var_bugs.items():
        for llvm_pass, per_pass_bugs in per_file_bugs.items():
            # No SP bugs for the pass.
            if len(per_pass_bugs) == 0:
                continue
            at_least_one_bug_found = True

            row = []
            table_di_var += " </tr>\n"

            # Get the bugs info.
            for x in per_pass_bugs:
                row.append(" <tr>\n")
                row.append(file)
                row.append(llvm_pass)
                row.append(x.name)
                row.append(x.fn_name)
                row.append(x.action)
                row.append(" </tr>\n")

            # Dump the bugs info into the table.
            for column in row:
                # The same file-pass pair can have multiple bugs.
                if column == " <tr>\n" or column == " </tr>\n":
                    table_di_var += column
                    continue
                table_di_var += " <td>{0}</td>\n".format(column.strip())
            table_di_var += " <tr>\n"

    if not at_least_one_bug_found:
        # NOTE(review): this table has 5 header columns but the placeholder
        # spans only 4 -- looks like a copy/paste from the SP table; confirm.
        table_di_var += """<tr>
<td colspan='4'> No bugs found </td>
</tr>
"""
    table_di_var += "</table>\n"

    # Create the summary table for the sp bugs.
    table_title_di_var_sum = "Summary of Variable Location Bugs"
    table_di_var_sum = """<table>
<caption><b>{}</b></caption>
<tr>
""".format(
        table_title_di_var_sum
    )

    header_di_var_sum = ["LLVM Pass Name", "Number of bugs"]

    for column in header_di_var_sum:
        table_di_var_sum += " <th>{0}</th>\n".format(column.strip())
    table_di_var_sum += " </tr>\n"

    # Print the summary.
    row = []
    for llvm_pass, num in sorted(di_var_bugs_summary.items()):
        row.append(" <tr>\n")
        row.append(llvm_pass)
        row.append(str(num))
        row.append(" </tr>\n")
    for column in row:
        if column == " <tr>\n" or column == " </tr>\n":
            table_di_var_sum += column
            continue
        table_di_var_sum += " <td>{0}</td>\n".format(column.strip())
    table_di_var_sum += " <tr>\n"

    if not at_least_one_bug_found:
        table_di_var_sum += """<tr>
<td colspan='2'> No bugs found </td>
</tr>
"""
    table_di_var_sum += "</table>\n"

    # Finish the html page.
    html_footer = """</body>
</html>"""
    new_line = "<br>\n"

    fileout.writelines(html_header)
    fileout.writelines(table_di_loc)
    fileout.writelines(new_line)
    fileout.writelines(table_di_loc_sum)
    fileout.writelines(new_line)
    fileout.writelines(new_line)
    fileout.writelines(table_di_sp)
    fileout.writelines(new_line)
    fileout.writelines(table_di_sp_sum)
    fileout.writelines(new_line)
    fileout.writelines(new_line)
    fileout.writelines(table_di_var)
    fileout.writelines(new_line)
    fileout.writelines(table_di_var_sum)
    fileout.writelines(html_footer)
    fileout.close()

    print("The " + html_file + " generated.")
# Read the JSON file in chunks.
def get_json_chunk(file, start, size):
    """Read up to `size` JSON objects from `file`, starting at 1-based line
    number `start`.

    The file contains one JSON object per line.  An example of one line
    (formatted):
    # {
    #  "file": "simple.c",
    #  "pass": "Deduce function attributes in RPO",
    #  "bugs": [
    #    [
    #      {
    #        "action": "drop",
    #        "metadata": "DISubprogram",
    #        "name": "fn2"
    #      },
    #      {
    #        "action": "drop",
    #        "metadata": "DISubprogram",
    #        "name": "fn1"
    #      }
    #    ]
    #  ]
    # }

    Returns a tuple (parsed_objects, skipped_line_count, last_line_read);
    the caller uses last_line_read to detect the end of the file.
    """
    di_checker_data = []
    skipped_lines = 0
    line = 0
    with open(file) as json_objects_file:
        for json_object_line in json_objects_file:
            line += 1
            if line < start:
                continue
            if line >= start + size:
                break
            try:
                json_object = loads(json_object_line)
            except ValueError:
                # Malformed line: count it and carry on.  json.JSONDecodeError
                # is a subclass of ValueError, and narrowing from the previous
                # bare `except:` avoids swallowing KeyboardInterrupt/SystemExit.
                skipped_lines += 1
            else:
                di_checker_data.append(json_object)
    return (di_checker_data, skipped_lines, line)
# Parse the program arguments.
def parse_program_args(parser):
    """Register this script's arguments on `parser` and return the parsed
    options namespace."""
    parser.add_argument("file_name", type=str, help="json file to process")
    parser.add_argument(
        "--reduce",
        action="store_true",
        help="create reduced report by deduplicating bugs within and across files",
    )
    # Exactly one output mode must be chosen: an HTML report file, or the
    # terminal-friendly acceptance-test mode.
    output_mode_group = parser.add_mutually_exclusive_group(required=True)
    output_mode_group.add_argument(
        "--report-html-file", type=str, help="output HTML file for the generated report"
    )
    output_mode_group.add_argument(
        "--acceptance-test",
        action="store_true",
        help="if set, produce terminal-friendly output and return 0 iff the input file is empty or does not exist",
    )
    return parser.parse_args()
def Main():
    """Parse the debugify JSON report named on the command line and either
    generate an HTML report (--report-html-file) or print the bugs to the
    terminal (--acceptance-test, exiting non-zero when any bug is found)."""
    parser = argparse.ArgumentParser()
    opts = parse_program_args(parser)

    if opts.report_html_file is not None and not opts.report_html_file.endswith(
        ".html"
    ):
        print("error: The output file must be '.html'.")
        sys.exit(1)

    if opts.acceptance_test:
        if os.path.isdir(opts.file_name):
            print(f"error: Directory passed as input file: '{opts.file_name}'")
            sys.exit(1)
        if not os.path.exists(opts.file_name):
            # We treat an empty input file as a success, as debugify will generate an output file iff any errors are
            # found, meaning we expect 0 errors to mean that the expected file does not exist.
            print(f"No errors detected for: {opts.file_name}")
            sys.exit(0)

    # Use the defaultdict in order to make multidim dicts:
    # file name -> pass name -> [bugs].
    di_location_bugs = defaultdict(lambda: defaultdict(list))
    di_subprogram_bugs = defaultdict(lambda: defaultdict(list))
    di_variable_bugs = defaultdict(lambda: defaultdict(list))

    # Use the ordered dict to make a summary: pass name -> number of bugs.
    di_location_bugs_summary = OrderedDict()
    di_sp_bugs_summary = OrderedDict()
    di_var_bugs_summary = OrderedDict()

    # If we are using --reduce, use these sets to deduplicate similar bugs
    # within and across files.
    di_loc_reduced_set = set()
    di_sp_reduced_set = set()
    di_var_reduced_set = set()

    def record_bug(bug, bugs_pass, seen, bug_list, reduced_seen, summary):
        """Append `bug` to `bug_list` unless it duplicates one already seen
        for this file/pass pair (or, with --reduce, one seen anywhere), and
        bump the per-pass summary counter for every newly seen bug."""
        key = bug.key()
        if key in seen:
            return
        seen.add(key)
        if opts.reduce:
            reduced_key = bug.reduced_key(bugs_pass)
            if reduced_key not in reduced_seen:
                reduced_seen.add(reduced_key)
                bug_list.append(bug)
        else:
            bug_list.append(bug)
        # Fill the summary dict.
        summary[bugs_pass] = summary.get(bugs_pass, 0) + 1

    start_line = 0
    chunk_size = 1000000
    end_line = chunk_size - 1
    skipped_lines = 0
    skipped_bugs = 0
    # Process each chunk of 1 million JSON lines; get_json_chunk returns the
    # last line it actually read, so a short read ends the loop.
    while start_line <= end_line:
        (debug_info_bugs, skipped, end_line) = get_json_chunk(
            opts.file_name, start_line, chunk_size
        )
        start_line += chunk_size
        skipped_lines += skipped

        # Map the bugs into the file-pass pairs.
        for bugs_per_pass in debug_info_bugs:
            try:
                bugs_file = bugs_per_pass["file"]
                bugs_pass = bugs_per_pass["pass"]
                bugs = bugs_per_pass["bugs"][0]
            except (KeyError, IndexError, TypeError):
                skipped_lines += 1
                continue

            # The same file/pass pair can be reported on multiple lines, so
            # start from any bugs already accumulated for it.  (This fixes the
            # previous code, which looked up the literal strings "bugs_file"/
            # "bugs_pass" and therefore always started from an empty list,
            # discarding earlier results on reassignment below.)
            di_loc_bugs = di_location_bugs.get(bugs_file, {}).get(bugs_pass, [])
            di_sp_bugs = di_subprogram_bugs.get(bugs_file, {}).get(bugs_pass, [])
            di_var_bugs = di_variable_bugs.get(bugs_file, {}).get(bugs_pass, [])

            # Omit duplicated bugs within this file/pass pair.
            di_loc_set = set()
            di_sp_set = set()
            di_var_set = set()
            for bug in bugs:
                try:
                    bugs_metadata = bug["metadata"]
                except (KeyError, TypeError):
                    skipped_bugs += 1
                    continue

                if bugs_metadata == "DILocation":
                    try:
                        origin = bug.get("origin")
                        action = bug["action"]
                        bb_name = bug["bb-name"]
                        fn_name = bug["fn-name"]
                        instr = bug["instr"]
                    except (KeyError, TypeError):
                        skipped_bugs += 1
                        continue
                    record_bug(
                        DILocBug(origin, action, bb_name, fn_name, instr),
                        bugs_pass,
                        di_loc_set,
                        di_loc_bugs,
                        di_loc_reduced_set,
                        di_location_bugs_summary,
                    )
                elif bugs_metadata == "DISubprogram":
                    try:
                        action = bug["action"]
                        name = bug["name"]
                    except (KeyError, TypeError):
                        skipped_bugs += 1
                        continue
                    record_bug(
                        DISPBug(action, name),
                        bugs_pass,
                        di_sp_set,
                        di_sp_bugs,
                        di_sp_reduced_set,
                        di_sp_bugs_summary,
                    )
                elif bugs_metadata == "dbg-var-intrinsic":
                    try:
                        action = bug["action"]
                        fn_name = bug["fn-name"]
                        name = bug["name"]
                    except (KeyError, TypeError):
                        skipped_bugs += 1
                        continue
                    record_bug(
                        DIVarBug(action, name, fn_name),
                        bugs_pass,
                        di_var_set,
                        di_var_bugs,
                        di_var_reduced_set,
                        di_var_bugs_summary,
                    )
                else:
                    # Unsupported metadata.
                    skipped_bugs += 1
                    continue

            # Store only non-empty lists so that the acceptance test's
            # truthiness check below stays meaningful.
            if di_loc_bugs:
                di_location_bugs[bugs_file][bugs_pass] = di_loc_bugs
            if di_sp_bugs:
                di_subprogram_bugs[bugs_file][bugs_pass] = di_sp_bugs
            if di_var_bugs:
                di_variable_bugs[bugs_file][bugs_pass] = di_var_bugs

    if opts.report_html_file is not None:
        generate_html_report(
            di_location_bugs,
            di_subprogram_bugs,
            di_variable_bugs,
            di_location_bugs_summary,
            di_sp_bugs_summary,
            di_var_bugs_summary,
            opts.report_html_file,
        )
    else:
        # Pretty(ish) print the detected bugs, but check if any exist first so that we don't print an empty dict.
        if di_location_bugs:
            print_bugs_yaml("DILocation Bugs", di_location_bugs)
        if di_subprogram_bugs:
            print_bugs_yaml("DISubprogram Bugs", di_subprogram_bugs)
        if di_variable_bugs:
            print_bugs_yaml("DIVariable Bugs", di_variable_bugs)

    if opts.acceptance_test:
        if any((di_location_bugs, di_subprogram_bugs, di_variable_bugs)):
            # Add a newline gap after printing at least one error.
            print()
            print(f"Errors detected for: {opts.file_name}")
            sys.exit(1)
        else:
            print(f"No errors detected for: {opts.file_name}")

    if skipped_lines > 0:
        print("Skipped lines: " + str(skipped_lines))
    if skipped_bugs > 0:
        print("Skipped bugs: " + str(skipped_bugs))
if __name__ == "__main__":
    # Main() exits with a non-zero status itself on error paths, so reaching
    # the line below means the run succeeded.
    Main()
    sys.exit(0)