elf_symbols/main.py

#!/bin/python

# ============================================================
# elf_symbols - dwarf symbols browser
# Copyright (C) 2023-2025 Juraj Oravec <jurajoravec@mailo.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
# ============================================================

import os
import sys
import argparse
import math
from dwex import formats
from elftools.dwarf.locationlists import LocationParser, LocationExpr
from elftools.dwarf.dwarf_expr import DWARFExprParser, DWARFExprOp, DW_OP_opcode2name
from dwex.dwarfone import DWARFExprParserV1
from pprint import pprint


SCRIPT_VERSION = '0.3.0'

# Runtime options shared by the whole module; main() overwrites every key
# from the parsed command line.
configuration = dict(
    # Prefix every printed entry with the name of its compile unit.
    include_file_name=False,
    # Enables the eprint()/epprint() diagnostics on stderr.
    print_debug_info=False,
    # Also walk structure/union children that are not DW_TAG_member.
    all_members=False,
    # Print only the first and the last element of an array.
    no_array_expand=False,
    # An array whose upper bound is not below this limit is reported
    # with 0 elements.
    array_element_limit=0xffffffffffffffff,
    # Keep only entries whose address matches the address recorded for
    # their name (hides duplicate entries).
    address_normalization=True,
    # Append the DW_AT_linkage_name to the displayed name.
    display_mangled_names=True,
)

# DIE tags that truly_resolve_type() treats as the final type when it
# follows a chain of DW_AT_type references.
supported_types = [
    'DW_TAG_base_type',
    'DW_TAG_structure_type',
    'DW_TAG_array_type',
    'DW_TAG_union_type',
    'DW_TAG_enumeration_type',
    'DW_TAG_pointer_type',
]


def eprint(*args, **kwargs):
    """print() to stderr, silenced unless debug output is enabled."""
    if not configuration['print_debug_info']:
        return
    print(*args, file=sys.stderr, **kwargs)


def epprint(*args, **kwargs):
    """pprint() to stderr, silenced unless debug output is enabled."""
    if not configuration['print_debug_info']:
        return
    pprint(*args, stream=sys.stderr, **kwargs)


class DWARFParseError(Exception):
    """The file could be opened, but its DWARF data could not be parsed.

    The DWARF info object that was being processed is kept in ``dwarfinfo``.
    """
    def __init__(self, exc, di):
        message = "DWARF parsing error: {}".format(exc)
        super().__init__(message)
        self.dwarfinfo = di


def decorate_die(die, i):
    """Attach the module's bookkeeping attributes to a DIE and return it.

    ``_i`` stores the index supplied by the caller; ``_children`` is reset to
    None, which load_children() treats as "children not loaded yet".
    """
    die._i, die._children = i, None
    return die


def load_children(parent_die):
    """Load the child DIEs of ``parent_die`` into ``parent_die._children``.

    Nothing happens when the children are already cached. The caller is
    expected to have checked that the DIE has children at all. A KeyError
    raised while iterating is reported on stdout and leaves an empty list.
    """
    if getattr(parent_die, "_children", None) is not None:
        return

    # TODO: wait cursor here. It may cause disk I/O
    try:
        loaded = []
        for index, child in enumerate(parent_die.iter_children()):
            loaded.append(decorate_die(child, index))
        parent_die._children = loaded
    except KeyError:
        # Catching #1516 from original project "DWARF Explorer" https://github.com/sevaa/dwex/
        print("This executable file is corrupt or incompatible.")
        parent_die._children = []


def safe_DIE_name(die, default=''):
    """Return a printable name for ``die``.

    The DW_AT_name is used when present. Otherwise the DIE tag is returned in
    debug mode and ``default`` in normal mode. When mangled names are enabled
    and the DIE has a DW_AT_linkage_name, it is appended after " - ".
    """
    attributes = die.attributes

    if 'DW_AT_name' in attributes:
        name = attributes['DW_AT_name'].value.decode('utf-8', errors='ignore')
    else:
        name = die.tag if configuration['print_debug_info'] else default

    if not (configuration['display_mangled_names'] and 'DW_AT_linkage_name' in attributes):
        return name

    mangled = attributes['DW_AT_linkage_name'].value.decode('utf-8', errors='ignore')
    return '{short_name} - {mangled_name}'.format(short_name=name, mangled_name=mangled)


class Bear():
    def __init__(self, filename):
        """Read the DWARF data of ``filename`` and collect its variables.

        The result is stored in ``self.myVariables``: one dictionary per
        compile unit with the unit name and the list of variable entries
        found directly in the unit and inside its subprograms.

        Fields used in a variable entry:
            name: variable name
            type: text description of the type
            size_byte: size of the variable in bytes
            size_bit: size of the variable in bits
            bit_offset: offset of the variable inside its byte
            address: absolute address of the variable
            children: list of child entries

        The process exits with status 1 when the file cannot be read as
        DWARF at all.

        Raises:
            DWARFParseError: when an AssertionError is raised while the
                compile units are enumerated.
        """
        # Create every result container up front. A file without compile
        # units returns early below; the object must still be usable by
        # normalize_by_address(), flatten_type() and pretty_print().
        self.myVariables = list()
        self.specifications = dict()
        self.valid_addresses = dict()
        self.flat_list = []
        self.top_dies = []
        self.filename = filename
        self.dwarfinfo = None

        di = formats.read_dwarf(filename, self.resolve_arch)
        if not di:  # Covers both False and None
            # stdout carries the symbol listing, keep diagnostics off it
            print("Something went wrong", file=sys.stderr)
            sys.exit(1)

        self.dwarfinfo = di

        # Some degree of graceful handling of wrong format
        try:
            # Some cached top level stuff
            # Notably, iter_CUs doesn't cache
            di._ranges = None   # Loaded on first use

            def decorate_cu(cu, i):
                cu._i = i
                cu._lineprogram = None
                cu._exprparser = None
                return cu
            # We'll need them first thing, might as well load here
            di._unsorted_CUs = [decorate_cu(cu, i) for (i, cu) in enumerate(di.iter_CUs())]
            if not di._unsorted_CUs:
                # Weird, but saw it once - debug sections present, but no CUs
                return
            # For quick CU search by offset within the info section, regardless of sorting
            di._CU_offsets = [cu.cu_offset for cu in di._unsorted_CUs]
            di._CUs = list(di._unsorted_CUs)

            di._locparser = None  # Created on first use
        except AssertionError as ass:  # Covers exceptions during parsing
            raise DWARFParseError(ass, di) from ass

        self.top_dies = [decorate_die(CU.get_top_DIE(), i) for (i, CU) in enumerate(di._CUs)]

        # Specifications must be known before any address is resolved
        self.load_specifications()

        for top_die in self.top_dies:
            # top dies only contain Compile Units
            CU_name = safe_DIE_name(top_die, '?')

            # Preload children
            load_children(top_die)

            children_dies = list()

            for child_die in top_die._children:
                if child_die.tag == 'DW_TAG_variable' and 'DW_AT_specification' not in child_die.attributes:
                    entry = {
                        # Name should be on every element, if not set something so it can be printed
                        'name': safe_DIE_name(child_die, '?'),
                        'CU_name': CU_name
                    }

                    self.resolve_address(child_die, entry, entry)

                    # Remember the address of every named variable that has one;
                    # normalize_by_address() filters against this table
                    if entry['name'] != '?' and entry['address']:
                        self.valid_addresses[entry['name']] = entry['address']

                    if 'DW_AT_type' in child_die.attributes:
                        typ_die = child_die.get_DIE_from_attribute('DW_AT_type')
                        self.truly_resolve_type(entry, typ_die)

                    children_dies.append(entry)
                elif child_die.tag == 'DW_TAG_subprogram':
                    children_dies.extend(self.load_subprogram(child_die, CU_name))

            self.myVariables.append({
                'name': CU_name,
                'children': children_dies
            })

    def load_subprogram(self, die, CU_name):
        """Collect the variable entries declared inside ``die``.

        ``die`` is a subprogram or a lexical block; nested lexical blocks are
        visited recursively. Variables carrying DW_AT_specification are
        skipped, and so are variables whose resolved address is negative
        (the original author labels this the "static variable" check).

        Returns:
            The list of entries, each tagged with ``CU_name``.
        """
        # Preload children
        load_children(die)

        collected = []

        for child_die in die._children:
            if child_die.tag == 'DW_TAG_variable' and 'DW_AT_specification' not in child_die.attributes:
                # Name should be on every element, if not set something so it can be printed
                entry = {'name': safe_DIE_name(child_die, '?'), 'CU_name': CU_name}

                self.resolve_address(child_die, entry, entry)

                # Check for static variable
                if entry['address'] < 0:
                    continue

                has_name = entry['name'] != '?'
                if has_name and entry['address']:
                    self.valid_addresses[entry['name']] = entry['address']

                if 'DW_AT_type' in child_die.attributes:
                    self.truly_resolve_type(entry, child_die.get_DIE_from_attribute('DW_AT_type'))

                collected.append(entry)
            elif child_die.tag == 'DW_TAG_lexical_block':
                collected.extend(self.load_subprogram(child_die, CU_name))

        return collected

    def load_specifications(self):
        """Record the address of every top-level variable that has a specification.

        For each DW_TAG_variable with DW_AT_specification directly below a
        compile unit, ``self.specifications`` maps the name of the referenced
        DIE to the resolved address and the name of the compile unit.
        """
        for top_die in self.top_dies:
            CU_name = safe_DIE_name(top_die, '?')
            # preload children
            load_children(top_die)
            for child_die in top_die._children:
                if child_die.tag != 'DW_TAG_variable':
                    continue
                if 'DW_AT_specification' not in child_die.attributes:
                    continue

                # Scratch entry, only used to obtain the address
                scratch = {'name': safe_DIE_name(child_die, '?')}
                self.resolve_address(child_die, scratch, scratch)

                declaration = child_die.get_DIE_from_attribute('DW_AT_specification')
                declared_name = safe_DIE_name(declaration, '?')
                self.specifications[declared_name] = {
                    'address': scratch['address'],
                    'CU_name': CU_name
                }

    def resolve_address(self, die, entry_source, entry_dest):
        """Compute ``entry_dest['address']`` for ``die``.

        The base address is, in order of preference, the address recorded in
        ``self.specifications`` for the entry name (only when the compile unit
        names match), or ``entry_source['address']`` when present, or 0. The
        first operand of the location expression of ``die``, or its constant
        location value, is added to that base.

        Args:
            die: DIE whose location attribute is inspected.
            entry_source: entry supplying the base address (the parent for
                structure members, the entry itself otherwise).
            entry_dest: entry that receives the ``address`` key.

        Returns:
            True when the address came from a specification or from a
            location attribute of ``die``, False otherwise.
        """
        l_result = False
        at_name = ""
        base_address = 0
        address_spec = 0

        for candidate in ('DW_AT_member_location', 'DW_AT_data_member_location', 'DW_AT_location'):
            if candidate in die.attributes:
                at_name = candidate
                break

        # Destination takes precedence; the source is only consulted when the
        # destination name is unknown to the specification table.
        if entry_dest['name'] in self.specifications:
            owner = entry_dest
        elif entry_source['name'] in self.specifications:
            owner = entry_source
        else:
            owner = None

        if owner is not None:
            spec = self.specifications[owner['name']]
            # Member entries carry no 'CU_name'. A member that merely shares
            # its name with a specification must not match (and must not
            # raise KeyError), hence .get().
            if spec['CU_name'] == owner.get('CU_name'):
                address_spec = spec['address']

        if address_spec:
            base_address = address_spec
            l_result = True

        if 'address' in entry_source and not base_address:
            base_address = entry_source['address']

        entry_dest['address'] = base_address

        if at_name:
            attr = die.attributes[at_name]
            version = die.cu['version']
            if LocationParser.attribute_has_location(attr, version):
                ll = self.parse_location(die, attr)
                # pyelftools hands back a LocationExpr for a single expression
                # and a plain list for a location list; only the former has a
                # single address to read.
                ops = self.dump_expr(die, ll.loc_expr) if isinstance(ll, LocationExpr) else []
                # Operations such as register locations carry no operand
                operand = ops[0].args[0] if ops and ops[0].args else None
                if isinstance(operand, int):
                    entry_dest['address'] = base_address + operand
                    l_result = True
                else:
                    eprint("Unsupported location information")
                    eprint(at_name, version)
            elif LocationParser._attribute_is_constant(attr, version):
                entry_dest['address'] = base_address + attr.value
                l_result = True
            else:
                eprint("Unsupported location information")
                eprint(at_name, version)

        return l_result

    def resolve_bit_size(self, die, entry):
        """Copy the bit-field attributes of ``die`` into ``entry`` and normalise them.

        ``size_bit`` comes from DW_AT_bit_size; ``bit_offset`` from
        DW_AT_data_bit_offset or, failing that, DW_AT_bit_offset. When a bit
        offset is known it is converted so that, on return, ``address`` has
        been advanced by whole bytes and ``bit_offset`` is the remainder
        inside that byte.
        """
        attrs = die.attributes

        if 'DW_AT_bit_size' in attrs:
            entry['size_bit'] = attrs['DW_AT_bit_size'].value

        for offset_attr in ('DW_AT_data_bit_offset', 'DW_AT_bit_offset'):
            if offset_attr in attrs:
                entry['bit_offset'] = attrs[offset_attr].value
                break

        if 'bit_offset' not in entry:
            return

        raw_offset = entry['bit_offset']
        if raw_offset & 0x100:
            # Bit 8 set: the low byte is used as the offset unchanged
            position = raw_offset & 0xFF
        else:
            # Otherwise mirror the offset inside its storage unit: the unit
            # is the larger of the byte-rounded offset and byte-rounded width
            width = entry['size_bit']
            top_by_offset = (math.ceil((raw_offset + 1) / 8) * 8) - 1
            top_by_width = (math.ceil(width / 8) * 8) - 1
            position = max(top_by_offset, top_by_width) - raw_offset - (width - 1)
        entry['bit_offset'] = position

        # Split into whole bytes (moved into the address) and leftover bits
        whole_bytes = math.floor(position / 8)
        entry['address'] = entry['address'] + whole_bytes
        entry['bit_offset'] = position - whole_bytes * 8

    def truly_resolve_type(self, entry, die_type):
        """Describe the type ``die_type`` inside ``entry``.

        Sets ``entry['type']`` and, depending on the type, ``size_byte``,
        ``children`` (structures and unions) and ``number_of_elements``
        (arrays). A type outside ``supported_types`` is followed through its
        DW_AT_type chain until a supported type is reached.

        Args:
            entry: variable or member entry to fill in; must contain 'name'.
            die_type: the type DIE to describe.
        """
        # "volatile void" has no DW_AT_type to follow
        if die_type.tag == 'DW_TAG_volatile_type' and 'DW_AT_type' in die_type.attributes:
            die_type = die_type.get_DIE_from_attribute('DW_AT_type')

        entry['type'] = safe_DIE_name(die_type, '?')

        self.resolve_address(die_type, entry, entry)

        if 'DW_AT_type' in die_type.attributes and die_type.tag not in supported_types:
            # Check if the type is a redefinition of a base type
            die_type_test = die_type
            while 'DW_AT_type' in die_type_test.attributes:
                die_type_test = die_type_test.get_DIE_from_attribute('DW_AT_type')
                if die_type_test.tag in supported_types:
                    die_type = die_type_test
                    break

        if 'DW_AT_byte_size' in die_type.attributes:
            entry['size_byte'] = die_type.attributes['DW_AT_byte_size'].value

        if die_type.tag == 'DW_TAG_base_type':
            # Show the underlying base type next to the alias that was used
            real_type_name = safe_DIE_name(die_type, '?')
            if real_type_name != '?' and real_type_name != entry['type']:
                entry['type'] = '{name} ({real})'.format(name=entry['type'],
                                                         real=real_type_name)
        elif die_type.tag == 'DW_TAG_structure_type':
            load_children(die_type)
            child_dies = []
            last_member_address = entry['address']
            last_member_size_byte = 0
            for child_die in die_type._children:
                if child_die.tag != 'DW_TAG_member' and not configuration['all_members']:
                    continue

                if 'DW_AT_type' in child_die.attributes:
                    typ_die = child_die.get_DIE_from_attribute('DW_AT_type')
                elif child_die.tag in supported_types:
                    typ_die = child_die
                else:
                    eprint('Child DIE with no type information')
                    epprint(child_die)
                    continue

                child_entry = dict()
                child_entry['name'] = safe_DIE_name(child_die, '?')

                # Member address = structure address + member location
                self.resolve_address(child_die, entry, child_entry)

                self.resolve_bit_size(child_die, child_entry)
                self.truly_resolve_type(child_entry, typ_die)

                if child_entry['address'] != entry['address']:
                    last_member_address = child_entry['address']
                    if 'size_byte' in child_entry:
                        last_member_size_byte = child_entry['size_byte']
                else:
                    # A member that resolved to the structure's own address is
                    # placed right behind the last member with a distinct address
                    child_entry['address'] = last_member_address + last_member_size_byte

                child_dies.append(child_entry)
            entry['children'] = child_dies
        elif die_type.tag == 'DW_TAG_array_type':
            # Describe the element type first; its size_byte is the element size
            self.truly_resolve_type(entry, die_type.get_DIE_from_attribute('DW_AT_type'))
            load_children(die_type)
            entry['number_of_elements'] = 0

            # Only the first child (first dimension) is consulted. An array
            # DIE without children, or with a bound that is not a plain
            # number, keeps 0 elements instead of raising.
            dimensions = die_type._children
            if dimensions and 'DW_AT_upper_bound' in dimensions[0].attributes:
                upper_bound = dimensions[0].attributes['DW_AT_upper_bound'].value
                if isinstance(upper_bound, int) and upper_bound < configuration['array_element_limit']:
                    entry['number_of_elements'] = upper_bound + 1
        elif die_type.tag == 'DW_TAG_union_type':
            load_children(die_type)
            child_entries = []
            for child_die in die_type._children:
                if child_die.tag != 'DW_TAG_member' and not configuration['all_members']:
                    continue

                if 'DW_AT_type' in child_die.attributes:
                    typ_die = child_die.get_DIE_from_attribute('DW_AT_type')
                elif child_die.tag in supported_types:
                    typ_die = child_die
                else:
                    eprint('Child DIE with no type information')
                    epprint(child_die)
                    continue

                child_entry = dict()
                child_entry['name'] = safe_DIE_name(child_die, '?')

                # Every union member starts at the address of the union
                if 'address' in entry:
                    child_entry['address'] = entry['address']

                self.resolve_bit_size(child_die, child_entry)

                self.truly_resolve_type(child_entry, typ_die)

                child_entries.append(child_entry)
            entry['children'] = child_entries
        elif die_type.tag == 'DW_TAG_enumeration_type':
            entry['type'] = safe_DIE_name(die_type, 'ENUM')
        elif die_type.tag == 'DW_TAG_pointer_type':
            entry['type'] = safe_DIE_name(die_type, 'POINTER')
        else:
            eprint("Unsupported type:", die_type.tag)

    def normalize_by_address(self):
        """Drop every variable whose address differs from the one recorded for its name.

        A variable survives only when it has a real name (not '?'), that name
        is present in ``self.valid_addresses`` and the recorded address equals
        the variable's own. ``self.myVariables`` is rebuilt with one
        name/children dictionary per compile unit.
        """
        known = self.valid_addresses
        self.myVariables = [
            {
                'name': unit['name'],
                'children': [
                    variable for variable in unit['children']
                    if variable['name'] != '?'
                    and variable['name'] in known
                    and variable['address'] == known[variable['name']]
                ],
            }
            for unit in self.myVariables
        ]

    def flatten_type(self, parent=None):
        """Flatten every collected variable into ``self.flat_list``.

        With the 'include_file_name' option the compile unit name is used as
        the parent name and ":" as the separator. ``parent`` is not used.
        """
        for compile_unit in self.myVariables:
            for variable in compile_unit['children']:
                prefix = (compile_unit['name'], ":") if configuration["include_file_name"] else ()
                self.pettanko(variable, *prefix)

    def pettanko(self, entry: dict, a_parent_name: str = '', separator: str = '.', address_offset: int = 0):
        """Append ``entry`` and everything below it to ``self.flat_list``.

        Args:
            entry: variable entry; 'name' is required, 'address', 'size_byte',
                'size_bit', 'bit_offset', 'type', 'children' and
                'number_of_elements' are used when present.
            a_parent_name: name prefix; joined to the entry name with
                ``separator`` when not empty.
            separator: text placed between parent and child names.
            address_offset: value added to the address stored in ``entry``
                (used to shift the children of array elements).

        Arrays (entries with 'number_of_elements') produce one flat entry for
        the whole array followed by one per element, each followed by the
        flattened children of the element type. With the 'no_array_expand'
        option only the first and the last element are produced.
        """
        flat_entry = dict()
        if a_parent_name:
            flat_entry['name'] = '{parent}{separator}{child}'.format(parent=a_parent_name,
                                                                     separator=separator,
                                                                     child=entry['name'])
        else:
            flat_entry['name'] = entry['name']

        if 'address' in entry:
            flat_entry['address'] = entry['address'] + address_offset

        if 'size_byte' in entry:
            flat_entry['size_byte'] = entry['size_byte']
            if 'number_of_elements' in entry:
                # entry['size_byte'] is the element size; report the whole array
                flat_entry['size_byte'] = flat_entry['size_byte'] * entry['number_of_elements']

        for key in ('size_bit', 'bit_offset', 'type'):
            if key in entry:
                flat_entry[key] = entry[key]

        self.flat_list.append(flat_entry)

        if 'number_of_elements' not in entry:
            for kid in entry.get('children', ()):
                self.pettanko(entry=kid, a_parent_name=flat_entry['name'],
                              separator=separator, address_offset=address_offset)
            return

        number_of_elements = entry['number_of_elements']
        step = 1
        if configuration['no_array_expand']:
            # A step of n - 1 visits exactly the first and the last element
            step = (number_of_elements - 1) if number_of_elements > 1 else number_of_elements

        if step == 0 or number_of_elements == 0:
            return

        for index in range(0, number_of_elements, step):
            kid = flat_entry.copy()
            kid['name'] = '{name}[{index}]'.format(name=flat_entry['name'], index=index)

            if 'size_byte' in entry:
                kid['size_byte'] = entry['size_byte']

            size_offset = address_offset
            if ('address' in flat_entry) and ('size_byte' in entry):
                element_shift = entry['size_byte'] * index
                # Children still hold their address inside element 0 of every
                # enclosing array, so they need the accumulated offset ...
                size_offset = address_offset + element_shift
                # ... while flat_entry['address'] already includes
                # address_offset; adding it again would shift the elements of
                # nested arrays twice.
                kid['address'] = flat_entry['address'] + element_shift

            self.pettanko(entry=kid, separator=separator, address_offset=0)

            kid_parent = self.flat_list[-1]

            if 'children' in entry:
                for kid_of_kid in entry['children']:
                    self.pettanko(entry=kid_of_kid, a_parent_name=kid_parent['name'],
                                  separator=separator, address_offset=size_offset)

    def pretty_print(self):
        """Print ``self.flat_list`` to stdout, one tab-separated line per entry.

        Bit fields (entries with both 'size_bit' and 'bit_offset') are printed
        as ``address  &bit_mask  name``; everything else as
        ``address  size_byte  name`` with a size of 1 when unknown. Entries
        that have neither an address nor a real name are skipped.
        """
        for entry in self.flat_list:
            address = entry.get('address', 0)

            if not address and entry['name'] == '?':
                continue

            if 'size_bit' in entry and 'bit_offset' in entry:
                # size_bit consecutive one bits
                bit_mask = (1 << max(entry['size_bit'], 0)) - 1

                if entry['bit_offset'] >= 0:
                    bit_mask = bit_mask << entry['bit_offset']
                else:
                    # Format the message first: eprint() returns None, so
                    # calling .format() on its result raised AttributeError.
                    eprint("Illegal offset {offset} for variable {variable}".format(
                        offset=entry['bit_offset'],
                        variable=entry['name']
                    ))

                # Evaluates to 0 for bit offsets 0..7, the range
                # resolve_bit_size() leaves behind
                address = address + int(((entry['bit_offset'] + 1) / 8) - 1)

                print('{address}\t&{bit_mask}\t{variable_name}'.format(
                    address=hex(address),
                    variable_name=entry['name'],
                    bit_mask=hex(bit_mask)
                ))
            else:
                print('{address}\t{size_byte}\t{variable_name}'.format(
                    address=hex(address),
                    variable_name=entry['name'],
                    size_byte=entry.get('size_byte', 1)
                ))

    def parse_location(self, die, attr):
        """Parse the location attribute ``attr`` of ``die``.

        The LocationParser is created on first use and cached on the DWARF
        info object of the DIE.
        """
        dwarfinfo = die.dwarfinfo
        parser = dwarfinfo._locparser
        if parser is None:
            parser = LocationParser(dwarfinfo.location_lists())
            dwarfinfo._locparser = parser
        return parser.parse_from_attribute(attr, die.cu['version'], die=die)

    # Expr is an expression blob
    # Returns whatever the expression parser's parse_expr() produces for it;
    # resolve_address() reads the first element's .args from that result
    def dump_expr(self, die, expr):
        """Parse the expression blob ``expr`` with the parser of the DIE's compile unit.

        The parser is created on first use and cached on the compile unit:
        DWARFExprParser for DWARF versions above 1, DWARFExprParserV1
        otherwise. The result of its parse_expr() is returned unchanged.
        """
        cu = die.cu
        if cu._exprparser is None:
            parser_class = DWARFExprParser if cu['version'] > 1 else DWARFExprParserV1
            cu._exprparser = parser_class(cu.structs)

        return cu._exprparser.parse_expr(expr)

    def resolve_arch(self, arches):
        """Architecture selection callback handed to formats.read_dwarf().

        Choosing among ``arches`` is not implemented: a notice is printed and
        None is returned.
        """
        notice = "resolve_arch: Unsupported feature"
        print(notice)
        return None


def main():
    """Command line entry point: parse the options, load the file, print the symbols.

    Exits with status 1 when no file is given, when the file does not exist
    or when its DWARF data cannot be parsed. A malformed
    --array-element-limit is reported through argparse (exit status 2).
    """
    from dwex.patch import monkeypatch
    monkeypatch()

    parser = argparse.ArgumentParser(
        description='Expands symbols (global variables) types.',
        epilog='Data displayed by this script are informative only!'
    )
    parser.add_argument('elf_file', metavar='file', type=str, nargs='?',
                        help='ELF file to try to extract symbols')
    parser.add_argument('-f', '--include-file-name', dest='include_file_name', action='store_true',
                        help='Display filename at the beginning')
    parser.add_argument('-d', '--print-debug-info', dest='print_debug_info', action='store_true',
                        help='Print debug infor when parsing DWARF')
    parser.add_argument('-a', '--all-members', dest='all_members', action='store_true',
                        help='Print information of all children')
    parser.add_argument('-e', '--no-array-expand', dest='no_array_expand', action='store_true',
                        help='Print only first and last array elements')
    parser.add_argument('--array-element-limit', dest='array_element_limit', action='store',
                        default=hex(configuration['array_element_limit']),
                        help='Maximum hexadecimal number of valid elements in an array')
    parser.add_argument('-n', '--disable-address-normalization', dest='address_normalization',
                        action='store_false',
                        help='Disable variable normalization by address, hides duplicate entries.')
    parser.add_argument('--no-mangled-names', dest='display_mangled_names', action='store_false',
                        help='Hide mangled names. By default mangled names are shown after the normal names')
    parser.add_argument('--version', action='version',
                        version='%(prog)s {version}'.format(version=SCRIPT_VERSION))

    args = parser.parse_args()

    try:
        array_element_limit = int(args.array_element_limit, 16)
    except ValueError:
        # Usage error instead of a traceback
        parser.error('--array-element-limit expects a hexadecimal number, got {value!r}'.format(
            value=args.array_element_limit))

    configuration['include_file_name'] = args.include_file_name
    configuration['print_debug_info'] = args.print_debug_info
    configuration['all_members'] = args.all_members
    configuration['no_array_expand'] = args.no_array_expand
    configuration['array_element_limit'] = array_element_limit
    configuration['address_normalization'] = args.address_normalization
    configuration['display_mangled_names'] = args.display_mangled_names

    if not args.elf_file:
        parser.print_help()
        sys.exit(1)
    if not os.path.exists(args.elf_file):
        # Printed unconditionally: eprint() stays silent without -d, which
        # made the script exit without any explanation.
        print('File {elf_file} does not exist!'.format(elf_file=args.elf_file), file=sys.stderr)
        sys.exit(1)

    try:
        bear = Bear(args.elf_file)
    except DWARFParseError as error:
        print(error, file=sys.stderr)
        sys.exit(1)

    if configuration['address_normalization']:
        bear.normalize_by_address()

    bear.flatten_type()
    bear.pretty_print()


# Run the command line interface only when the file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()