From 488bec0457b96e9fc2a54243dd3a2eeeb436e8ab Mon Sep 17 00:00:00 2001 From: Anthony Mattas Date: Thu, 11 Jun 2026 09:47:04 -0400 Subject: [PATCH 1/2] Ghidra: read all comment types in _comments(), fix EOL/PRE loss _comments() read only EOL and PRE comments and let PRE silently overwrite EOL at the same address, and never read PLATE comments -- so function-level comments written by _set_comment could not be read back out. Read all five Ghidra comment types per code unit and join the populated ones, tagging entry-point PLATE comments with func_addr so they round-trip through _set_comment. Iterate functions directly for func_addr context and drop the now-unused __function_code_units helper. --- declib/decompilers/ghidra/interface.py | 75 ++++++++++++++++---------- 1 file changed, 47 insertions(+), 28 deletions(-) diff --git a/declib/decompilers/ghidra/interface.py b/declib/decompilers/ghidra/interface.py index 4e54684..bf1293f 100644 --- a/declib/decompilers/ghidra/interface.py +++ b/declib/decompilers/ghidra/interface.py @@ -838,25 +838,54 @@ def _get_comment(self, addr) -> Optional[Comment]: return comments.get(addr, None) def _comments(self) -> Dict[int, Comment]: + from .compat.imports import CodeUnit + + # Ghidra stores up to five distinct comment types per code unit, but the declib Comment + # model holds a single string per address. The previous implementation only read EOL and + # PRE comments and let PRE silently clobber EOL at the same address; it also never read + # PLATE comments, so function-level comments written by _set_comment could not be read + # back. We now capture every populated type and join them in display order so no comment + # text is dropped. A PLATE comment on a function's entry instruction is the function-level + # comment, so it is tagged with func_addr to round-trip through _set_comment as a PLATE. + ordered_types = ( + CodeUnit.PLATE_COMMENT, + CodeUnit.PRE_COMMENT, + CodeUnit.EOL_COMMENT, + CodeUnit.POST_COMMENT, + CodeUnit.REPEATABLE_COMMENT, + ) + decompiled_types = {CodeUnit.PLATE_COMMENT, CodeUnit.PRE_COMMENT} + comments = {} - funcs_code_units = self.__function_code_units() - for code_units in funcs_code_units: - for code_unit in code_units: - # TODO: this could be bad if we have multiple comments at the same address (pre and eol) - # eol comment - eol_cmt = code_unit.getComment(0) - if eol_cmt: - addr = int(code_unit.getAddress().getOffset()) - comments[addr] = Comment( - addr=addr, comment=str(eol_cmt) - ) - # pre comment - pre_cmt = code_unit.getComment(1) - if pre_cmt: - addr = int(code_unit.getAddress().getOffset()) - comments[addr] = Comment( - addr=addr, comment=str(pre_cmt), decompiled=True - ) + listing = self.currentProgram.getListing() + for func in self.currentProgram.getFunctionManager().getFunctions(True): + func_addr = int(func.getEntryPoint().getOffset()) + for code_unit in listing.getCodeUnits(func.getBody(), True): + parts = [] + decompiled = False + has_plate = False + for cmt_type in ordered_types: + text = code_unit.getComment(cmt_type) + if not text: + continue + parts.append(str(text)) + decompiled |= cmt_type in decompiled_types + has_plate |= cmt_type == CodeUnit.PLATE_COMMENT + + if not parts: + continue + + addr = int(code_unit.getAddress().getOffset()) + # Only a PLATE comment sitting on the entry instruction is the function-level + # comment; tagging func_addr makes _set_comment re-apply it as a PLATE. Other + # comments at the entry must keep func_addr=None so they are not promoted to PLATE. + comment_func_addr = func_addr if (has_plate and addr == func_addr) else None + comments[addr] = Comment( + addr=addr, + func_addr=comment_func_addr, + comment="\n".join(parts), + decompiled=decompiled, + ) return comments @@ -1421,13 +1450,3 @@ def __gtypedefs(self): if isinstance(typedef, TypedefDB) ] - - def __function_code_units(self): - """ - Returns a list of code units for each function in the program. - """ - return [ - [code_unit for code_unit in self.currentProgram.getListing().getCodeUnits(func.getBody(), True)] - for func in self.currentProgram.getFunctionManager().getFunctions(True) - ] - From c2cc283b0bcb6d6906c3d1905822001e5bc153bd Mon Sep 17 00:00:00 2001 From: Anthony Mattas Date: Sun, 14 Jun 2026 15:43:44 -0400 Subject: [PATCH 2/2] Fix Ghidra comment read type mapping --- declib/decompilers/ghidra/interface.py | 49 +++---- tests/test_ghidra_comments.py | 176 +++++++++++++++++++++++++ 2 files changed, 196 insertions(+), 29 deletions(-) create mode 100644 tests/test_ghidra_comments.py diff --git a/declib/decompilers/ghidra/interface.py b/declib/decompilers/ghidra/interface.py index bf1293f..a0cb646 100644 --- a/declib/decompilers/ghidra/interface.py +++ b/declib/decompilers/ghidra/interface.py @@ -840,51 +840,43 @@ def _get_comment(self, addr) -> Optional[Comment]: def _comments(self) -> Dict[int, Comment]: from .compat.imports import CodeUnit - # Ghidra stores up to five distinct comment types per code unit, but the declib Comment - # model holds a single string per address. The previous implementation only read EOL and - # PRE comments and let PRE silently clobber EOL at the same address; it also never read - # PLATE comments, so function-level comments written by _set_comment could not be read - # back. We now capture every populated type and join them in display order so no comment - # text is dropped. A PLATE comment on a function's entry instruction is the function-level - # comment, so it is tagged with func_addr to round-trip through _set_comment as a PLATE. + # Ghidra stores multiple comment slots per code unit, while declib has one + # portable Comment per address. Preserve all populated text and label the + # slots when more than one has to collapse into the same Comment. ordered_types = ( - CodeUnit.PLATE_COMMENT, - CodeUnit.PRE_COMMENT, - CodeUnit.EOL_COMMENT, - CodeUnit.POST_COMMENT, - CodeUnit.REPEATABLE_COMMENT, + (CodeUnit.PLATE_COMMENT, "PLATE", False), + (CodeUnit.PRE_COMMENT, "PRE", True), + (CodeUnit.EOL_COMMENT, "EOL", False), + (CodeUnit.POST_COMMENT, "POST", False), + (CodeUnit.REPEATABLE_COMMENT, "REPEATABLE", False), ) - decompiled_types = {CodeUnit.PLATE_COMMENT, CodeUnit.PRE_COMMENT} comments = {} listing = self.currentProgram.getListing() for func in self.currentProgram.getFunctionManager().getFunctions(True): - func_addr = int(func.getEntryPoint().getOffset()) for code_unit in listing.getCodeUnits(func.getBody(), True): - parts = [] - decompiled = False - has_plate = False - for cmt_type in ordered_types: + comment_entries = [] + for cmt_type, label, is_decompiled_type in ordered_types: text = code_unit.getComment(cmt_type) if not text: continue - parts.append(str(text)) - decompiled |= cmt_type in decompiled_types - has_plate |= cmt_type == CodeUnit.PLATE_COMMENT + comment_entries.append((label, str(text), is_decompiled_type)) - if not parts: + if not comment_entries: continue + should_prefix = len(comment_entries) > 1 + parts = [ + f"[{label}] {text}" if should_prefix else text + for label, text, _ in comment_entries + ] + has_decompiled = any(is_decompiled for _, _, is_decompiled in comment_entries) + has_disassembly = any(not is_decompiled for _, _, is_decompiled in comment_entries) addr = int(code_unit.getAddress().getOffset()) - # Only a PLATE comment sitting on the entry instruction is the function-level - # comment; tagging func_addr makes _set_comment re-apply it as a PLATE. Other - # comments at the entry must keep func_addr=None so they are not promoted to PLATE. - comment_func_addr = func_addr if (has_plate and addr == func_addr) else None comments[addr] = Comment( addr=addr, - func_addr=comment_func_addr, comment="\n".join(parts), - decompiled=decompiled, + decompiled=has_decompiled and not has_disassembly, ) return comments @@ -1449,4 +1441,3 @@ def __gtypedefs(self): for typedef in self.currentProgram.getDataTypeManager().getAllDataTypes() if isinstance(typedef, TypedefDB) ] - diff --git a/tests/test_ghidra_comments.py b/tests/test_ghidra_comments.py new file mode 100644 index 0000000..b49d25f --- /dev/null +++ b/tests/test_ghidra_comments.py @@ -0,0 +1,176 @@ +import sys +import types +import unittest + + +_MISSING = object() + + +class FakeCodeUnit: + EOL_COMMENT = 0 + PRE_COMMENT = 1 + POST_COMMENT = 2 + PLATE_COMMENT = 3 + REPEATABLE_COMMENT = 4 + + +def _restore_modules(saved_modules): + for name, module in saved_modules.items(): + if module is _MISSING: + sys.modules.pop(name, None) + else: + sys.modules[name] = module + + +def _install_ghidra_import_stubs(): + module_names = ( + "pyghidra", + "pyghidra.core", + "jpype", + "declib.decompilers.ghidra.compat.imports", + "declib.decompilers.ghidra.compat.headless", + "declib.decompilers.ghidra.interface", + ) + saved_modules = {name: sys.modules.get(name, _MISSING) for name in module_names} + + pyghidra_mod = types.ModuleType("pyghidra") + pyghidra_core_mod = types.ModuleType("pyghidra.core") + pyghidra_core_mod._analyze_program = lambda *args, **kwargs: None + pyghidra_core_mod._get_language = lambda *args, **kwargs: None + pyghidra_core_mod._get_compiler_spec = lambda *args, **kwargs: None + sys.modules.setdefault("pyghidra", pyghidra_mod) + sys.modules.setdefault("pyghidra.core", pyghidra_core_mod) + + jpype_mod = types.ModuleType("jpype") + jpype_mod.JClass = type + sys.modules.setdefault("jpype", jpype_mod) + + compat_imports_mod = types.ModuleType("declib.decompilers.ghidra.compat.imports") + compat_imports_mod.CodeUnit = FakeCodeUnit + sys.modules["declib.decompilers.ghidra.compat.imports"] = compat_imports_mod + return saved_modules + + +class FakeAddress: + def __init__(self, offset): + self._offset = offset + + def getOffset(self): + return self._offset + + +class FakeFunction: + def __init__(self, entry, body): + self._entry = entry + self._body = body + + def getEntryPoint(self): + return FakeAddress(self._entry) + + def getBody(self): + return self._body + + +class FakeCodeUnitInstance: + def __init__(self, addr, comments): + self._addr = addr + self._comments = comments + + def getAddress(self): + return FakeAddress(self._addr) + + def getComment(self, comment_type): + return self._comments.get(comment_type) + + +class FakeListing: + def __init__(self, code_units_by_body): + self._code_units_by_body = code_units_by_body + + def getCodeUnits(self, body, forward): + return iter(self._code_units_by_body[body]) + + +class FakeFunctionManager: + def __init__(self, funcs): + self._funcs = funcs + + def getFunctions(self, forward): + return iter(self._funcs) + + +class FakeProgram: + def __init__(self, funcs, listing): + self._func_manager = FakeFunctionManager(funcs) + self._listing = listing + + def getFunctionManager(self): + return self._func_manager + + def getListing(self): + return self._listing + + +class TestGhidraComments(unittest.TestCase): + def _make_interface(self, code_units): + saved_modules = _install_ghidra_import_stubs() + self.addCleanup(_restore_modules, saved_modules) + from declib.decompilers.ghidra.interface import GhidraDecompilerInterface + + class TestableGhidraDecompilerInterface(GhidraDecompilerInterface): + @property + def currentProgram(self): + return self._program_for_test + + func = FakeFunction(0x401000, "main_body") + deci = object.__new__(TestableGhidraDecompilerInterface) + deci._program_for_test = FakeProgram( + [func], + FakeListing({"main_body": code_units}), + ) + return deci + + def test_comments_maps_ghidra_slots_to_portable_comment_kinds(self): + deci = self._make_interface([ + FakeCodeUnitInstance( + 0x401000, + { + FakeCodeUnit.PLATE_COMMENT: "plate", + FakeCodeUnit.EOL_COMMENT: "eol", + FakeCodeUnit.POST_COMMENT: "post", + FakeCodeUnit.REPEATABLE_COMMENT: "repeatable", + }, + ), + FakeCodeUnitInstance(0x401010, {FakeCodeUnit.PRE_COMMENT: "pre"}), + FakeCodeUnitInstance( + 0x401020, + { + FakeCodeUnit.PRE_COMMENT: "pre", + FakeCodeUnit.EOL_COMMENT: "eol", + }, + ), + ]) + + comments = deci._comments() + + disassembly_comment = comments[0x401000] + self.assertIsNone(disassembly_comment.func_addr) + self.assertFalse(disassembly_comment.decompiled) + self.assertEqual( + disassembly_comment.comment, + "[PLATE] plate\n[EOL] eol\n[POST] post\n[REPEATABLE] repeatable", + ) + + pseudocode_comment = comments[0x401010] + self.assertIsNone(pseudocode_comment.func_addr) + self.assertTrue(pseudocode_comment.decompiled) + self.assertEqual(pseudocode_comment.comment, "pre") + + mixed_comment = comments[0x401020] + self.assertIsNone(mixed_comment.func_addr) + self.assertFalse(mixed_comment.decompiled) + self.assertEqual(mixed_comment.comment, "[PRE] pre\n[EOL] eol") + + +if __name__ == "__main__": + unittest.main()