Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion src/tocode/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def collect(self) -> ProgramAnalysis:
or f"{self.session.backend_label} auto-analysis"
)
self.progress.log(f"Analyzing with {label}")
with self.progress.bar(total=15, desc="analyze", unit="step") as bar:
with self.progress.bar(total=16, desc="analyze", unit="step") as bar:
self.session.analyze()
bar.update(1)
info = self.session.info()
Expand All @@ -92,6 +92,8 @@ def collect(self) -> ProgramAnalysis:
bar.update(1)
flags = [self._flag(row) for row in self.session.flags()]
bar.update(1)
data_xrefs = self._data_xrefs(symbols, relocations, strings, flags)
bar.update(1)
routines = self._routines(self.session.functions(), imports, segments)
bar.update(1)
callees, callers, import_calls = self._call_graph(routines, imports)
Expand Down Expand Up @@ -128,6 +130,7 @@ def collect(self) -> ProgramAnalysis:
import_calls=import_calls,
roots=roots,
thunks=thunks,
data_xrefs=data_xrefs,
)
self.analysis = analysis
self.analysis_seconds = time.monotonic() - started
Expand Down Expand Up @@ -249,6 +252,28 @@ def _routines(
)
return result

def _data_xrefs(
    self,
    symbols: list[SymbolEntry],
    relocations: list[RelocationEntry],
    strings: list[StringEntry],
    flags: list[FlagEntry],
) -> dict[int, list[tuple[int, bool]]]:
    """Ask the backend for data cross-references to every known data address.

    The candidate addresses are the ``vaddr`` of each string, of each
    non-imported symbol and of each relocation, plus the ``offset`` of each
    flag. The set is handed to the session's ``data_xrefs`` hook in one call.

    Returns an empty dict when the session has no callable ``data_xrefs``
    attribute, when there are no candidate addresses, or when the hook
    raises; otherwise returns whatever the hook returns.
    """
    # The hook is optional: backends without it simply yield no xrefs.
    backend_hook = getattr(self.session, "data_xrefs", None)
    if not callable(backend_hook):
        return {}

    targets: set[int] = {entry.vaddr for entry in strings}
    targets |= {entry.vaddr for entry in symbols if not entry.imported}
    targets |= {entry.vaddr for entry in relocations}
    targets |= {entry.offset for entry in flags}
    if not targets:
        return {}

    # Best effort: a failing backend must not abort the whole analysis.
    try:
        found = backend_hook(targets)
    except Exception:  # noqa: BLE001
        found = {}
    return found

def _call_graph(
self,
routines: dict[int, Routine],
Expand Down
2 changes: 2 additions & 0 deletions src/tocode/backends/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ def calls_from(
self, address: int, imports: dict[int, Any], functions: dict[int, Any]
) -> tuple[list[int], list[str]]: ...

# Backend hook for data cross-references. Takes an iterable of addresses and
# returns a mapping from address to a list of (int, bool) pairs.
# NOTE(review): judging by the concrete backends, each pair is
# (referencing address, is-write) and addresses without references are
# omitted - confirm that every implementation follows this contract.
def data_xrefs(self, addresses: Any) -> dict[int, list[tuple[int, bool]]]: ...


@dataclass(slots=True)
class IdaProbe:
Expand Down
82 changes: 44 additions & 38 deletions src/tocode/backends/ida.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def __init__(
self._ida_fixup = self._optional_import("ida_fixup")
self._ida_auto = self._optional_import("ida_auto")
self._ida_nalt = self._optional_import("ida_nalt")
self._ida_xref = self._optional_import("ida_xref")
self._db: Any = None

if db_path is None:
Expand Down Expand Up @@ -118,13 +119,9 @@ def __init__(

self._strings_ready = False
self._decompiler_ready = False
self._disasm_cache: dict[int, str] = {}
self._decompile_cache: dict[int, str] = {}
self._summary_cache: dict[int, str] = {}
self._locals_cache: dict[int, list[Any]] = {}
self._imports_cache: list[dict[str, Any]] | None = None
self._relocs_cache: list[dict[str, Any]] | None = None
self._primed: set[int] = set()

def _optional_import(self, module: str):
try:
Expand All @@ -145,23 +142,36 @@ def _wait_for_auto_analysis(self) -> None:
def analyze(self) -> None:
if self._strings_ready:
return
try:
from ida_domain.strings import StringListConfig, StringType
# An already-analyzed database (e.g. a reused `.i64`) usually carries a
# populated string list, so rescanning the whole image is wasted work.
# Only rebuild when the list is empty: in that case there is nothing to
# lose by rescanning, so the rebuild is never skipped.
if not self._has_strings():
try:
from ida_domain.strings import StringListConfig, StringType

self._db.strings.rebuild(
StringListConfig(
string_types=[StringType.C, StringType.C_16],
min_len=4,
only_ascii_7bit=False,
self._db.strings.rebuild(
StringListConfig(
string_types=[StringType.C, StringType.C_16],
min_len=4,
only_ascii_7bit=False,
)
)
)
except Exception: # noqa: BLE001
try:
self._db.strings.rebuild()
except Exception: # noqa: BLE001
pass
try:
self._db.strings.rebuild()
except Exception: # noqa: BLE001
pass
self._strings_ready = True

def _has_strings(self) -> bool:
    """Report whether the database's string list yields at least one item.

    Only the first element is requested, so a populated list is never walked
    in full. Any exception raised while iterating counts as "no strings".
    """
    nothing = object()
    try:
        first = next(iter(self._db.strings), nothing)
    except Exception:  # noqa: BLE001
        return False
    return first is not nothing

def close(self) -> None:
if self._db is None:
return
Expand Down Expand Up @@ -212,9 +222,6 @@ def restore_parallel_resources(self) -> None:
self._open_existing_database(resolved_db)

def release_render_memory(self) -> None:
self._disasm_cache.clear()
self._decompile_cache.clear()
self._summary_cache.clear()
self._locals_cache.clear()
if self._ida_hexrays is None:
return
Expand Down Expand Up @@ -256,11 +263,7 @@ def _open_existing_database(self, resolved_db: Path) -> None:

def _clear_caches(self) -> None:
self._decompiler_ready = False
self._disasm_cache.clear()
self._decompile_cache.clear()
self._summary_cache.clear()
self._locals_cache.clear()
self._primed.clear()

def worker(self) -> "IdaSession":
if self._cache_db is not None and self._cache_db.exists():
Expand Down Expand Up @@ -505,8 +508,6 @@ def decompile(self, address: int) -> str:
return "\n".join(lines) if isinstance(lines, list) else str(lines)

def function_summary(self, address: int) -> str:
if address in self._summary_cache:
return self._summary_cache[address]
func = self._need_function(address)
signature = self._db.functions.get_signature(
func
Expand Down Expand Up @@ -557,6 +558,24 @@ def calls_from(
imported.add(name)
return sorted(edges), sorted(name for name in imported if name)

def data_xrefs(self, addresses: Any) -> dict[int, list[tuple[int, bool]]]:
    """Collect cross-references pointing at each of ``addresses``.

    Every address is coerced to ``int`` and walked with a fresh
    ``xrefblk_t`` cursor (``first_to`` / ``next_to``), passing the module's
    ``XREF_DATA`` flag. Each hit becomes ``(source address, is_write)``,
    where ``is_write`` is true when the xref type equals the module's
    ``dr_W`` constant (falling back to ``2`` if the module lacks it).

    Returns a dict keyed by target address; targets without any hit are
    left out. Returns an empty dict when ``ida_xref`` could not be imported.
    """
    module = self._ida_xref
    if module is None:
        return {}
    write_kind = int(getattr(module, "dr_W", 2))

    def incoming(ea: int):
        # One cursor per target; first_to/next_to report whether a record
        # is currently loaded into the cursor.
        cursor = module.xrefblk_t()
        alive = cursor.first_to(ea, module.XREF_DATA)
        while alive:
            yield int(cursor.frm), int(cursor.type) == write_kind
            alive = cursor.next_to()

    found: dict[int, list[tuple[int, bool]]] = {}
    for raw in addresses:
        ea = int(raw)
        hits = list(incoming(ea))
        if hits:
            found[ea] = hits
    return found

def _resolve_thunk(self, func: Any) -> Any:
from ida_domain.functions import FunctionFlags

Expand All @@ -581,19 +600,6 @@ def _need_function(self, address: int):
raise BackendError(f"IDA could not resolve function at 0x{address:x}")
return func

def _prime(self, address: int) -> None:
    """Warm up ``address`` once and invalidate its cached locals and summary.

    Addresses already recorded in ``self._primed`` are ignored. Otherwise,
    when the Hex-Rays module is available, the decompiler is initialised and
    pseudocode is requested for the function; any failure there is ignored.
    The address is then dropped from the locals and summary caches and
    marked as primed.
    """
    if address in self._primed:
        return
    hexrays_loaded = self._ida_hexrays is not None
    if hexrays_loaded:
        # Best effort: a function that cannot be decompiled is still primed.
        try:
            self.ensure_decompiler()
            target = self._need_function(address)
            self._function_pseudocode(target)
        except Exception:  # noqa: BLE001
            pass
    self._locals_cache.pop(address, None)
    self._summary_cache.pop(address, None)
    self._primed.add(address)

def _locals(self, address: int) -> list[Any]:
if address not in self._locals_cache:
try:
Expand Down
18 changes: 18 additions & 0 deletions src/tocode/backends/r2.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,24 @@ def _disasm_json(self, address: int) -> dict[str, Any]:
self._pdfj[address] = self.cmdj(f"pdfj @ 0x{address:x}") or {}
return self._pdfj[address]

def data_xrefs(self, addresses) -> dict[int, list[tuple[int, bool]]]:
    """Collect cross-references pointing at each of ``addresses`` via ``axtj``.

    For every address (coerced to ``int``) the JSON result of
    ``axtj @ 0x<addr>`` is read; a falsy result counts as no rows. Rows
    without a ``from`` field are skipped. A row counts as a write when its
    lower-cased ``perm`` contains ``"w"`` or its lower-cased ``type`` is
    exactly ``"write"``.

    Returns a dict keyed by target address with ``(source, is_write)``
    pairs; targets without any usable row are left out.
    """
    found: dict[int, list[tuple[int, bool]]] = {}
    for raw in addresses:
        ea = int(raw)
        entries = self.cmdj(f"axtj @ 0x{ea:x}") or []
        hits: list[tuple[int, bool]] = [
            (
                int(source),
                "w" in str(entry.get("perm", "")).lower()
                or str(entry.get("type", "")).lower() == "write",
            )
            for entry in entries
            if (source := entry.get("from")) is not None
        ]
        if hits:
            found[ea] = hits
    return found

def calls_from(
self, address: int, imports, functions
) -> tuple[list[int], list[str]]:
Expand Down
6 changes: 6 additions & 0 deletions src/tocode/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,11 @@ def build_parser() -> argparse.ArgumentParser:
action="store_true",
help="Also write tree-sitter/Semgrep friendly source under src/tree.",
)
parser.add_argument(
"--entropy",
action="store_true",
help="Compute per-section Shannon entropy (off by default; slow on large binaries).",
)
parser.add_argument(
"-q",
"--quiet",
Expand Down Expand Up @@ -130,4 +135,5 @@ def _run_one(
progress=progress,
jobs=args.jobs,
tree=args.tree,
entropy=args.entropy,
)
61 changes: 55 additions & 6 deletions src/tocode/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ class ExportContext:
out_dir: Path | None
jobs: int | None
tree_enabled: bool
entropy_enabled: bool = False
analysis: ProgramAnalysis | None = None
root: Path | None = None
raw_dir: Path | None = None
Expand Down Expand Up @@ -146,6 +147,7 @@ def export_binary(
progress: Progress | None = None,
jobs: int | None = None,
tree: bool = False,
entropy: bool = False,
) -> ExportSummary:
progress = progress or analyzer.progress
context = ExportContext(
Expand All @@ -154,6 +156,7 @@ def export_binary(
out_dir=out_dir,
jobs=jobs,
tree_enabled=tree,
entropy_enabled=entropy,
)
_prepare_tree(context)
_cluster(context)
Expand Down Expand Up @@ -244,6 +247,13 @@ def _select_render_workers(context: ExportContext) -> None:
else None,
)
context.render_mode = "process" if context.worker_count > 1 else "single"
if is_ida and context.jobs is not None and context.worker_count < context.jobs:
context.progress.log(
f"Note: limiting to {context.worker_count} worker(s) instead of the "
f"requested {context.jobs} to fit available memory "
f"(each IDA worker loads the whole database; override with "
f"TOCODE_IDA_WORKER_MEMORY_MB)."
)
context.progress.log(
describe_jobs(
function_count=count,
Expand Down Expand Up @@ -871,6 +881,8 @@ def build_tree_cluster_file(
else raw_resolved.with_suffix(".asm"),
asm_line_start=raw_range.asm_line_start if raw_range is not None else 1,
asm_line_end=raw_range.asm_line_end if raw_range is not None else 1,
arg_count=raw_range.arg_count if raw_range is not None else None,
local_count=raw_range.local_count if raw_range is not None else None,
)
)
c_line = end + 2
Expand Down Expand Up @@ -937,6 +949,7 @@ def build_cluster_files(
_summary_function(routine, summary_path, item.summary_text).rstrip()
+ "\n\n"
)
arg_count, local_count = _counts_from_summary(item.summary_text)
ranges.append(
FunctionRange(
address=address,
Expand All @@ -947,6 +960,8 @@ def build_cluster_files(
asm_file=asm_resolved,
asm_line_start=asm_start,
asm_line_end=asm_end,
arg_count=arg_count,
local_count=local_count,
)
)
c_line = c_end + 2
Expand All @@ -961,6 +976,35 @@ def build_cluster_files(
}


def _counts_from_summary(summary_text: str) -> tuple[int | None, int | None]:
"""Recover the argument and local counts the backend reported in a summary.

The decompiler computes these while rendering, so reading them back from the
summary avoids re-deriving them (which, for IDA, would mean decompiling every
function a second time during inventory). Returns ``(None, None)`` when the
summary does not carry the fields, so callers can fall back to inventory data.
"""
args: int | None = None
locals_: int | None = None
for line in summary_text.splitlines():
key, sep, value = line.partition(":")
if not sep:
continue
label = key.strip()
if label == "args":
args = _safe_int(value)
elif label == "locals":
locals_ = _safe_int(value)
return args, locals_


def _safe_int(value: str) -> int | None:
try:
return int(value.strip())
except (TypeError, ValueError):
return None


def render_functions(
*,
analyzer: BinaryAnalyzer,
Expand Down Expand Up @@ -1450,7 +1494,7 @@ def write_header() -> None:

def write_variables() -> None:
context.data_variable_count = export_variables(
analysis, root, context.raw_ranges
analysis, root, entropy=context.entropy_enabled
)

def write_indexes() -> None:
Expand Down Expand Up @@ -1484,7 +1528,11 @@ def write_triage() -> None:
write_json(
root / "triage.json",
triage_json(
analysis, context.clusters, context.raw_ranges, shared["reachable"]
analysis,
context.clusters,
context.raw_ranges,
shared["reachable"],
entropy=context.entropy_enabled,
),
)

Expand All @@ -1506,13 +1554,14 @@ def write_docs() -> None:
("function index", write_indexes),
(
"sections.json",
lambda: write_json(root / "sections.json", sections_json(analysis)),
lambda: write_json(
root / "sections.json",
sections_json(analysis, entropy=context.entropy_enabled),
),
),
(
"strings.json",
lambda: write_json(
root / "strings.json", strings_json(analysis, context.raw_ranges)
),
lambda: write_json(root / "strings.json", strings_json(analysis)),
),
(
"imports.json",
Expand Down
Loading
Loading