[HWASan] Improve symbol indexing (#135967)

Previously we would add any ELF that contained a build id regardless of
whether the ELF contained symbols or not. This works for Android since
soong will strip the symbols into a new directory. However, other
build systems, like BUCK, will write the stripped file in the same
directory as the unstripped file. This would cause the hwasan_symbolize
script to sometimes add the stripped ELF to its index and ignore the
symbolized ELF. The logic has now been changed to only add ELFs that
contain symbols to the index. If two symbolized ELFs are encountered
with the same build id, we now exit out with an error.

Fixes #135966

---------

Co-authored-by: Stefan Bossbaly <sboss@meta.com>
This commit is contained in:
Stefan Bossbaly 2025-05-23 12:43:04 -04:00 committed by GitHub
parent 01cb390efd
commit 0cf3c437c1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -16,6 +16,7 @@ from __future__ import unicode_literals
import argparse
import glob
import hashlib
import html
import json
import mmap
@ -37,8 +38,9 @@ if sys.version_info.major < 3:
Ehdr_size = 64
e_shnum_offset = 60
e_shoff_offset = 40
e_shstrndx_offset = 62
Shdr_size = 64
sh_name_offset = 0
sh_type_offset = 4
sh_offset_offset = 24
sh_size_offset = 32
@ -62,13 +64,32 @@ def handle_Nhdr(mv, sh_size):
offset += Nhdr_size + align_up(n_namesz, 4) + align_up(n_descsz, 4)
return None
def handle_Shdr(mv):
def handle_shstrtab(mv, e_shoff):
    """Return a view of the section that stores the section header names.

    Args:
        mv: Buffer holding the ELF image, starting at the ELF header.
        e_shoff: Offset of the section header table within mv.

    Returns:
        The slice of mv described by the section header whose index is
        stored in the ELF header's e_shstrndx field.
    """
    # e_shstrndx selects the section header that describes the name table.
    (names_index,) = struct.unpack_from('<H', buffer=mv, offset=e_shstrndx_offset)
    header_begin = e_shoff + names_index * Shdr_size
    header_end = header_begin + Shdr_size
    # Only the offset and size of the name table are needed here.
    _, table_begin, table_size = handle_Shdr(mv[header_begin:header_end])
    return mv[table_begin:table_begin + table_size]
def read_string(mv):
    """Decode the NUL-terminated string at the start of a byte buffer.

    Args:
        mv: Iterable yielding integer byte values (for example a memoryview
            or bytes slice positioned at the first character of the string).

    Returns:
        The characters that precede the first zero byte, with each byte
        mapped to the code point of the same value. If the buffer contains
        no zero byte, every byte in it is decoded.
    """
    chars = []
    for byte in mv:
        # A zero byte terminates the string; nothing after it is read.
        if byte == 0:
            break
        chars.append(chr(byte))
    # Join once instead of growing a str with += for every byte.
    return "".join(chars)
def unpack_sh_type(mv):
sh_type, = struct.unpack_from('<I', buffer=mv, offset=sh_type_offset)
if sh_type != SHT_NOTE:
return None, None
return sh_type
def handle_Shdr(mv):
name_offset, = struct.unpack_from('<I', buffer=mv, offset=sh_name_offset)
sh_offset, = struct.unpack_from('<Q', buffer=mv, offset=sh_offset_offset)
sh_size, = struct.unpack_from('<Q', buffer=mv, offset=sh_size_offset)
return sh_offset, sh_size
return name_offset, sh_offset, sh_size
def handle_elf(mv):
# \x02 is ELFCLASS64, \x01 is ELFDATA2LSB. HWASan currently only works on
@ -76,19 +97,37 @@ def handle_elf(mv):
# have to extend the parsing code.
if mv[:6] != b'\x7fELF\x02\x01':
return None
found_symbols = False
bid = None
e_shnum, = struct.unpack_from('<H', buffer=mv, offset=e_shnum_offset)
e_shoff, = struct.unpack_from('<Q', buffer=mv, offset=e_shoff_offset)
# Section where all the section header names are stored.
shstr = handle_shstrtab(mv, e_shoff)
for i in range(0, e_shnum):
start = e_shoff + i * Shdr_size
sh_offset, sh_size = handle_Shdr(mv[start: start + Shdr_size])
if sh_offset is None:
continue
note_hdr = mv[sh_offset: sh_offset + sh_size]
result = handle_Nhdr(note_hdr, sh_size)
if result is not None:
return result
sh = mv[start: start + Shdr_size]
sh_name_offset, sh_offset, sh_size = handle_Shdr(sh)
sh_name = read_string(shstr[sh_name_offset:])
sh_type = unpack_sh_type(sh)
def get_buildid(filename):
if sh_name == ".debug_info":
found_symbols = True
if sh_type == SHT_NOTE:
if sh_offset is None:
continue
note_hdr = mv[sh_offset: sh_offset + sh_size]
result = handle_Nhdr(note_hdr, sh_size)
if result is not None:
bid = result
if found_symbols:
return bid
else:
return None
def read_elf(filename):
with open(filename, "r") as fd:
if os.fstat(fd.fileno()).st_size < Ehdr_size:
return None
@ -200,7 +239,7 @@ class Symbolizer:
if os.path.exists(full_path):
return full_path
if name not in self.__warnings:
print("Could not find symbols for", name, file=sys.stderr)
print("Could not find symbols for {} (Build ID: {})".format(name, buildid), file=sys.stderr)
self.__warnings.add(name)
return None
@ -268,13 +307,30 @@ class Symbolizer:
for fn in fnames:
filename = os.path.join(dname, fn)
try:
bid = get_buildid(filename)
bid = read_elf(filename)
except FileNotFoundError:
continue
except Exception as e:
print("Failed to parse {}: {}".format(filename, e), file=sys.stderr)
continue
if bid is not None:
if bid is None:
continue
if bid in self.__index:
index_filename = self.__index[bid]
if os.path.samefile(index_filename, filename):
continue
with open(filename, "rb") as f:
file_hash = hashlib.file_digest(f, "sha256")
with open(index_filename, "rb") as f:
index_file_hash = hashlib.file_digest(f, "sha256")
if index_file_hash.digest() != file_hash.digest():
print("Build ID collision! Files share the same BuildId ({}) but their contents differ. Files {} and {} ".format(bid, filename, index_filename), file=sys.stderr)
else:
self.__index[bid] = filename
def symbolize_line(self, line):