buzzer-re · buzzer-re · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
diff --git a/src/tocode/analysis.py b/src/tocode/analysis.py
@@ -69,7 +69,7 @@ def collect(self) -> ProgramAnalysis:
             or f"{self.session.backend_label} auto-analysis"
         )
         self.progress.log(f"Analyzing with {label}")
-        with self.progress.bar(total=15, desc="analyze", unit="step") as bar:
+        with self.progress.bar(total=16, desc="analyze", unit="step") as bar:
             self.session.analyze()
             bar.update(1)
             info = self.session.info()
@@ -92,6 +92,8 @@ def collect(self) -> ProgramAnalysis:
             bar.update(1)
             flags = [self._flag(row) for row in self.session.flags()]
             bar.update(1)
+            data_xrefs = self._data_xrefs(symbols, relocations, strings, flags)
+            bar.update(1)
             routines = self._routines(self.session.functions(), imports, segments)
             bar.update(1)
             callees, callers, import_calls = self._call_graph(routines, imports)
@@ -128,6 +130,7 @@ def collect(self) -> ProgramAnalysis:
             import_calls=import_calls,
             roots=roots,
             thunks=thunks,
+            data_xrefs=data_xrefs,
         )
         self.analysis = analysis
         self.analysis_seconds = time.monotonic() - started
@@ -249,6 +252,28 @@ def _routines(
             )
         return result
 
+    def _data_xrefs(
+        self,
+        symbols: list[SymbolEntry],
+        relocations: list[RelocationEntry],
+        strings: list[StringEntry],
+        flags: list[FlagEntry],
+    ) -> dict[int, list[tuple[int, bool]]]:
+        """Ask the backend for data cross-references to the known data addresses.
+
+        The queried set is the union of every string ``vaddr``, every
+        non-imported symbol ``vaddr``, every relocation ``vaddr`` and every
+        flag ``offset``.
+
+        This step is best effort. It returns whatever the session's
+        ``data_xrefs`` produces, or ``{}`` when the session has no callable
+        ``data_xrefs``, when there is no address to query, or when the backend
+        call raises (the failure is reported through ``self.progress.log``).
+        """
+        query = getattr(self.session, "data_xrefs", None)
+        if not callable(query):
+            # Backends without xref support simply contribute nothing.
+            return {}
+        addresses: set[int] = set()
+        addresses.update(item.vaddr for item in strings)
+        addresses.update(item.vaddr for item in symbols if not item.imported)
+        addresses.update(item.vaddr for item in relocations)
+        addresses.update(item.offset for item in flags)
+        if not addresses:
+            return {}
+        try:
+            return query(addresses)
+        except Exception as exc:  # noqa: BLE001
+            # Keep the analysis going, but do not hide the failure: an empty
+            # xref table would otherwise be indistinguishable from "no xrefs".
+            self.progress.log(f"Data xref collection failed: {exc}")
+            return {}
+
     def _call_graph(
         self,
         routines: dict[int, Routine],

diff --git a/src/tocode/backends/base.py b/src/tocode/backends/base.py
@@ -59,6 +59,8 @@ def calls_from(
         self, address: int, imports: dict[int, Any], functions: dict[int, Any]
     ) -> tuple[list[int], list[str]]: ...
 
+    # Maps each queried address that has at least one reference to a list of
+    # (referencing address, is_write) pairs.
+    def data_xrefs(self, addresses: Any) -> dict[int, list[tuple[int, bool]]]: ...
+
 
 @dataclass(slots=True)
 class IdaProbe:

diff --git a/src/tocode/backends/ida.py b/src/tocode/backends/ida.py
@@ -74,6 +74,7 @@ def __init__(
         self._ida_fixup = self._optional_import("ida_fixup")
         self._ida_auto = self._optional_import("ida_auto")
         self._ida_nalt = self._optional_import("ida_nalt")
+        self._ida_xref = self._optional_import("ida_xref")
         self._db: Any = None
 
         if db_path is None:
@@ -118,13 +119,9 @@ def __init__(
 
         self._strings_ready = False
         self._decompiler_ready = False
-        self._disasm_cache: dict[int, str] = {}
-        self._decompile_cache: dict[int, str] = {}
-        self._summary_cache: dict[int, str] = {}
         self._locals_cache: dict[int, list[Any]] = {}
         self._imports_cache: list[dict[str, Any]] | None = None
         self._relocs_cache: list[dict[str, Any]] | None = None
-        self._primed: set[int] = set()
 
     def _optional_import(self, module: str):
         try:
@@ -145,23 +142,36 @@ def _wait_for_auto_analysis(self) -> None:
     def analyze(self) -> None:
         if self._strings_ready:
             return
-        try:
-            from ida_domain.strings import StringListConfig, StringType
+        # An already-analyzed database (e.g. a reused `.i64`) usually carries a
+        # populated string list, so rescanning the whole image is wasted work.
+        # Rebuild only when the list is empty: with no existing entries to lose,
+        # the rescan is never skipped in that case.
+        if not self._has_strings():
+            try:
+                from ida_domain.strings import StringListConfig, StringType
 
-            self._db.strings.rebuild(
-                StringListConfig(
-                    string_types=[StringType.C, StringType.C_16],
-                    min_len=4,
-                    only_ascii_7bit=False,
+                self._db.strings.rebuild(
+                    StringListConfig(
+                        string_types=[StringType.C, StringType.C_16],
+                        min_len=4,
+                        only_ascii_7bit=False,
+                    )
                 )
-            )
-        except Exception:  # noqa: BLE001
-            try:
-                self._db.strings.rebuild()
             except Exception:  # noqa: BLE001
-                pass
+                try:
+                    self._db.strings.rebuild()
+                except Exception:  # noqa: BLE001
+                    pass
         self._strings_ready = True
 
+    def _has_strings(self) -> bool:
+        """Tell whether iterating the database string list yields any entry.
+
+        Iteration stops at the first entry. Any exception raised while
+        reaching or iterating the list is treated the same as an empty list.
+        """
+        try:
+            return any(True for _ in self._db.strings)
+        except Exception:  # noqa: BLE001
+            return False
+
     def close(self) -> None:
         if self._db is None:
             return
@@ -212,9 +222,6 @@ def restore_parallel_resources(self) -> None:
         self._open_existing_database(resolved_db)
 
     def release_render_memory(self) -> None:
-        self._disasm_cache.clear()
-        self._decompile_cache.clear()
-        self._summary_cache.clear()
         self._locals_cache.clear()
         if self._ida_hexrays is None:
             return
@@ -256,11 +263,7 @@ def _open_existing_database(self, resolved_db: Path) -> None:
 
     def _clear_caches(self) -> None:
         self._decompiler_ready = False
-        self._disasm_cache.clear()
-        self._decompile_cache.clear()
-        self._summary_cache.clear()
         self._locals_cache.clear()
-        self._primed.clear()
 
     def worker(self) -> "IdaSession":
         if self._cache_db is not None and self._cache_db.exists():
@@ -505,8 +508,6 @@ def decompile(self, address: int) -> str:
         return "\n".join(lines) if isinstance(lines, list) else str(lines)
 
     def function_summary(self, address: int) -> str:
-        if address in self._summary_cache:
-            return self._summary_cache[address]
         func = self._need_function(address)
         signature = self._db.functions.get_signature(
             func
@@ -557,6 +558,24 @@ def calls_from(
                     imported.add(name)
         return sorted(edges), sorted(name for name in imported if name)
 
+    def data_xrefs(self, addresses: Any) -> dict[int, list[tuple[int, bool]]]:
+        """Return the data references to each address, keyed by that address.
+
+        Every value is a list of ``(source, is_write)`` pairs, where
+        ``is_write`` is true when the reference type equals ``ida_xref.dr_W``
+        (2 is used when the module does not expose it). Addresses with no
+        reference are left out. Yields ``{}`` when ``ida_xref`` could not be
+        imported.
+        """
+        module = self._ida_xref
+        if module is None:
+            return {}
+        dr_write = int(getattr(module, "dr_W", 2))
+
+        def walk(ea: int):
+            # A fresh cursor per address, restricted to data references.
+            cursor = module.xrefblk_t()
+            more = cursor.first_to(ea, module.XREF_DATA)
+            while more:
+                yield int(cursor.frm), int(cursor.type) == dr_write
+                more = cursor.next_to()
+
+        found: dict[int, list[tuple[int, bool]]] = {}
+        for ea in map(int, addresses):
+            hits = list(walk(ea))
+            if hits:
+                found[ea] = hits
+        return found
+
     def _resolve_thunk(self, func: Any) -> Any:
         from ida_domain.functions import FunctionFlags
 
@@ -581,19 +600,6 @@ def _need_function(self, address: int):
             raise BackendError(f"IDA could not resolve function at 0x{address:x}")
         return func
 
-    def _prime(self, address: int) -> None:
-        if address in self._primed:
-            return
-        if self._ida_hexrays is not None:
-            try:
-                self.ensure_decompiler()
-                self._function_pseudocode(self._need_function(address))
-            except Exception:  # noqa: BLE001
-                pass
-        self._locals_cache.pop(address, None)
-        self._summary_cache.pop(address, None)
-        self._primed.add(address)
-
     def _locals(self, address: int) -> list[Any]:
         if address not in self._locals_cache:
             try:

diff --git a/src/tocode/backends/r2.py b/src/tocode/backends/r2.py
@@ -139,6 +139,24 @@ def _disasm_json(self, address: int) -> dict[str, Any]:
             self._pdfj[address] = self.cmdj(f"pdfj @ 0x{address:x}") or {}
         return self._pdfj[address]
 
+    def data_xrefs(self, addresses) -> dict[int, list[tuple[int, bool]]]:
+        """Return the data references to each address, keyed by that address.
+
+        Runs ``axtj`` once per address and turns every usable row into a
+        ``(source, is_write)`` pair. A row counts as a write when its ``perm``
+        contains ``w`` or its ``type`` is ``write``. Rows without a ``from``
+        field, rows that are not JSON objects, and control-flow references
+        (call/code/jump) are skipped. Addresses left with no reference are
+        omitted from the result.
+        """
+        # axtj lists every kind of reference; the IDA backend only reports
+        # data references, so drop control-flow rows to keep backends aligned.
+        control_flow = {"call", "code", "jump"}
+        result: dict[int, list[tuple[int, bool]]] = {}
+        for address in addresses:
+            target = int(address)
+            rows = self.cmdj(f"axtj @ 0x{target:x}") or []
+            refs: list[tuple[int, bool]] = []
+            for row in rows:
+                if not isinstance(row, dict):
+                    # Unexpected JSON shape; nothing to read from it.
+                    continue
+                frm = row.get("from")
+                if frm is None:
+                    continue
+                kind = str(row.get("type", "")).lower()
+                if kind in control_flow:
+                    continue
+                perm = str(row.get("perm", "")).lower()
+                is_write = "w" in perm or kind == "write"
+                refs.append((int(frm), is_write))
+            if refs:
+                result[target] = refs
+        return result
+
     def calls_from(
         self, address: int, imports, functions
     ) -> tuple[list[int], list[str]]:

diff --git a/src/tocode/cli.py b/src/tocode/cli.py
@@ -78,6 +78,11 @@ def build_parser() -> argparse.ArgumentParser:
         action="store_true",
         help="Also write tree-sitter/Semgrep friendly source under src/tree.",
     )
+    parser.add_argument(
+        "--entropy",
+        action="store_true",
+        help="Compute per-section Shannon entropy (off by default; slow on large binaries).",
+    )
     parser.add_argument(
         "-q",
         "--quiet",
@@ -130,4 +135,5 @@ def _run_one(
             progress=progress,
             jobs=args.jobs,
             tree=args.tree,
+            entropy=args.entropy,
         )
diff --git a/src/tocode/exporter.py b/src/tocode/exporter.py
@@ -89,6 +89,7 @@ class ExportContext:
     out_dir: Path | None
     jobs: int | None
     tree_enabled: bool
+    entropy_enabled: bool = False
     analysis: ProgramAnalysis | None = None
     root: Path | None = None
     raw_dir: Path | None = None
@@ -146,6 +147,7 @@ def export_binary(
     progress: Progress | None = None,
     jobs: int | None = None,
     tree: bool = False,
+    entropy: bool = False,
 ) -> ExportSummary:
     progress = progress or analyzer.progress
     context = ExportContext(
@@ -154,6 +156,7 @@ def export_binary(
         out_dir=out_dir,
         jobs=jobs,
         tree_enabled=tree,
+        entropy_enabled=entropy,
     )
     _prepare_tree(context)
     _cluster(context)
@@ -244,6 +247,13 @@ def _select_render_workers(context: ExportContext) -> None:
             else None,
         )
         context.render_mode = "process" if context.worker_count > 1 else "single"
+        if is_ida and context.jobs is not None and context.worker_count < context.jobs:
+            context.progress.log(
+                f"Note: limiting to {context.worker_count} worker(s) instead of the "
+                f"requested {context.jobs} to fit available memory "
+                f"(each IDA worker loads the whole database; override with "
+                f"TOCODE_IDA_WORKER_MEMORY_MB)."
+            )
         context.progress.log(
             describe_jobs(
                 function_count=count,
@@ -871,6 +881,8 @@ def build_tree_cluster_file(
                 else raw_resolved.with_suffix(".asm"),
                 asm_line_start=raw_range.asm_line_start if raw_range is not None else 1,
                 asm_line_end=raw_range.asm_line_end if raw_range is not None else 1,
+                arg_count=raw_range.arg_count if raw_range is not None else None,
+                local_count=raw_range.local_count if raw_range is not None else None,
             )
         )
         c_line = end + 2
@@ -937,6 +949,7 @@ def build_cluster_files(
             _summary_function(routine, summary_path, item.summary_text).rstrip()
             + "\n\n"
         )
+        arg_count, local_count = _counts_from_summary(item.summary_text)
         ranges.append(
             FunctionRange(
                 address=address,
@@ -947,6 +960,8 @@ def build_cluster_files(
                 asm_file=asm_resolved,
                 asm_line_start=asm_start,
                 asm_line_end=asm_end,
+                arg_count=arg_count,
+                local_count=local_count,
             )
         )
         c_line = c_end + 2
@@ -961,6 +976,35 @@ def build_cluster_files(
     }
 
 
+def _counts_from_summary(summary_text: str) -> tuple[int | None, int | None]:
+    """Read the argument and local counts back out of a rendered summary.
+
+    The backend already worked these numbers out while rendering, so taking
+    them from the summary spares inventory from deriving them again (for IDA
+    that would mean decompiling every function a second time).
+
+    Each line is split at its first colon; a label of ``args`` or ``locals``
+    (after stripping whitespace) sets the matching count, and a later line
+    overrides an earlier one. A missing field, or a value that is not an
+    integer, gives ``None`` for that count so callers can fall back to
+    inventory data.
+    """
+    counts: dict[str, int | None] = {"args": None, "locals": None}
+    for raw_line in summary_text.splitlines():
+        if ":" not in raw_line:
+            continue
+        head, _, tail = raw_line.partition(":")
+        field = head.strip()
+        if field in counts:
+            counts[field] = _safe_int(tail)
+    return counts["args"], counts["locals"]
+
+
+def _safe_int(value: str) -> int | None:
+    """Parse *value* as an integer, ignoring surrounding whitespace.
+
+    Returns ``None`` rather than raising when the text is not a valid integer.
+    """
+    try:
+        number = int(value.strip())
+    except (TypeError, ValueError):
+        return None
+    else:
+        return number
+
+
 def render_functions(
     *,
     analyzer: BinaryAnalyzer,
@@ -1450,7 +1494,7 @@ def write_header() -> None:
 
     def write_variables() -> None:
         context.data_variable_count = export_variables(
-            analysis, root, context.raw_ranges
+            analysis, root, entropy=context.entropy_enabled
         )
 
     def write_indexes() -> None:
@@ -1484,7 +1528,11 @@ def write_triage() -> None:
         write_json(
             root / "triage.json",
             triage_json(
-                analysis, context.clusters, context.raw_ranges, shared["reachable"]
+                analysis,
+                context.clusters,
+                context.raw_ranges,
+                shared["reachable"],
+                entropy=context.entropy_enabled,
             ),
         )
 
@@ -1506,13 +1554,14 @@ def write_docs() -> None:
         ("function index", write_indexes),
         (
             "sections.json",
-            lambda: write_json(root / "sections.json", sections_json(analysis)),
+            lambda: write_json(
+                root / "sections.json",
+                sections_json(analysis, entropy=context.entropy_enabled),
+            ),
         ),
         (
             "strings.json",
-            lambda: write_json(
-                root / "strings.json", strings_json(analysis, context.raw_ranges)
-            ),
+            lambda: write_json(root / "strings.json", strings_json(analysis)),
         ),
         (
             "imports.json",