From 43463cc2cfa2b72a96404968631b3ad8032616ca Mon Sep 17 00:00:00 2001
From: PavanSiligam <pavan.siligam@gmail.com>
Date: Wed, 13 May 2026 02:00:58 +0200
Subject: [PATCH 1/2] Add configurable checksum types and upgrade imohash
 sample size

- Upgrade imohash default sample size from 16 KB to 64 KB (prefix: imohash-64k)
  to reduce false-negative risk in large file comparisons
- Add --checksum-type option to checksums command supporting imohash-64k
  (default), md5, and xxhash
- Guard compare/summary against mixing CSVs with different checksum types,
  raising a clear error to prevent silent mismatches
- Add xxhash to dependencies (setup.py, environment.yaml)
- Add tests documenting imohash false-negative behaviour

Closes #4

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 environment.yaml                      |  1 +
 ptool/analyse.py                      | 22 +++++--
 ptool/checksums.py                    | 83 +++++++++++++++++++++------
 ptool/cli.py                          | 17 ++++--
 setup.py                              |  4 ++
 tests/__init__.py                     |  0
 tests/test_imohash_false_negatives.py | 68 ++++++++++++++++++++++
 7 files changed, 168 insertions(+), 27 deletions(-)
 create mode 100644 tests/__init__.py
 create mode 100644 tests/test_imohash_false_negatives.py

diff --git a/environment.yaml b/environment.yaml
index bcac994..3829fd7 100644
--- a/environment.yaml
+++ b/environment.yaml
@@ -20,3 +20,4 @@ dependencies:
   - pip:
     - imohash
     - tqdm
+    - xxhash
diff --git a/ptool/analyse.py b/ptool/analyse.py
index e153c51..08b1791 100644
--- a/ptool/analyse.py
+++ b/ptool/analyse.py
@@ -1,10 +1,12 @@
-import os
 import itertools
-import pandas as pd
-import numpy as np
-import humanize
+import os
 from collections import defaultdict
 
+import click
+import humanize
+import numpy as np
+import pandas as pd
+
 __all__ = [
     "read_csv",
     "compare",
@@ -31,6 +33,7 @@ def read_csv(filename, ignore=None, drop_duplicates=False):
     if ignore:
         df = df[~df.rparent.str.contains(ignore)]
         df = df[~df.fname.str.contains(ignore)]
+    df["checksum_type"] = df.checksum.str.split(":").str[0]
     df = df.sort_values(by=["checksum", "mtime"])
     dups = df[
         df.duplicated(subset=["checksum", "fname"]).values
@@ -74,7 +77,18 @@ def directory_map(m):
     return (m[["rparent_left", "rparent_right"]]).drop_duplicates()
 
 
+def _assert_compatible_checksums(left, right):
+    lt = left.checksum_type.iloc[0]
+    rt = right.checksum_type.iloc[0]
+    if lt != rt:
+        raise click.UsageError(
+            f"Checksum type mismatch: left CSV uses '{lt}' but right CSV uses '{rt}'. "
+            "Re-generate both snapshots with the same --checksum-type before comparing."
+        )
+
+
 def compare(left, right, relabel=False, threshold=0.1):
+    _assert_compatible_checksums(left, right)
     by_hash = merge(left, right)
     by_name = merge(left, right, on="fname")
     by_hash["flag"] = ""
diff --git a/ptool/checksums.py b/ptool/checksums.py
index f2d19a4..e9e50c6 100755
--- a/ptool/checksums.py
+++ b/ptool/checksums.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 
 import fnmatch
+import hashlib
 import os
 import re
 import sys
@@ -12,6 +13,8 @@
 from imohash import hashfile
 from tqdm.contrib.concurrent import process_map
 
+IMOHASH_SAMPLE_SIZE = 64 * 1024  # upgraded from 16 KB to reduce false-negative risk
+
 not_hidden_files_or_dirs = re.compile(r"^[^.]").match
 
 
@@ -117,21 +120,52 @@ def result(self):
         return self.value
 
 
-def hasher(filename):
-    "Calucates imohash for a given file"
-    return f"imohash:{hashfile(filename, hexdigest=True)}"
+def _imohash(filename):
+    return f"imohash-64k:{hashfile(filename, sample_size=IMOHASH_SAMPLE_SIZE, hexdigest=True)}"
+
+
+def _md5(filename):
+    h = hashlib.md5()
+    with open(filename, "rb") as f:
+        for chunk in iter(lambda: f.read(8 * 1024 * 1024), b""):
+            h.update(chunk)
+    return f"md5:{h.hexdigest()}"
+
+
+def _xxhash(filename):
+    import xxhash
+    h = xxhash.xxh3_64()
+    with open(filename, "rb") as f:
+        for chunk in iter(lambda: f.read(8 * 1024 * 1024), b""):
+            h.update(chunk)
+    return f"xxhash:{h.hexdigest()}"
+
+
+HASHERS = {
+    "imohash-64k": _imohash,
+    "md5": _md5,
+    "xxhash": _xxhash,
+}
 
 
-def stats(fpath, stat=os.stat):
-    "Generates record with imohash and file stats information"
-    try:
-        checksum = hasher(fpath)
-        st = stat(fpath)
-        record = f"{checksum},{st.st_size},{st.st_mtime},{fpath}"
-        record = Results(value=record)
-    except Exception as e:
-        record = Results(exc=f"{str(e)}")
-    return record
+def make_stats(checksum_type):
+    "Returns a stats function bound to the given checksum type"
+    hasher = HASHERS[checksum_type]
+
+    def stats(fpath, stat=os.stat):
+        try:
+            checksum = hasher(fpath)
+            st = stat(fpath)
+            record = f"{checksum},{st.st_size},{st.st_mtime},{fpath}"
+            return Results(value=record)
+        except Exception as e:
+            return Results(exc=f"{str(e)}")
+
+    return stats
+
+
+# default stats function for backwards-compatible use
+stats = make_stats("imohash-64k")
 
 
 def scanner(path, ignore=None, ignore_dirs=None, drop_hidden_files=True):
@@ -178,8 +212,8 @@ def get_files(path, ignore=None, ignore_dirs=None, drop_hidden_files=True):
     return list(files_iter)
 
 
-def main(path, outfile, ignore=None, ignore_dirs=None, drop_hidden_files=True):
-    "Calculates hashs of all the files in parallel"
+def main(path, outfile, ignore=None, ignore_dirs=None, drop_hidden_files=True, checksum_type="imohash-64k"):
+    "Calculates hashes of all the files in parallel"
     echo("Gathering files...")
     with timethis("getting files"):
         if os.path.isdir(path):
@@ -194,11 +228,12 @@ def main(path, outfile, ignore=None, ignore_dirs=None, drop_hidden_files=True):
     nfiles = len(files)
     echo(f"nfiles: {nfiles}")
     results = ["checksum,fsize,mtime,fpath"]
-    echo("Calculating hashes...")
+    echo(f"Calculating hashes ({checksum_type})...")
     errors = []
+    stats_fn = make_stats(checksum_type)
     with timethis("calculating hashes"):
         futures = process_map(
-            stats, files, chunksize=10, max_workers=os.cpu_count(), unit="files"
+            stats_fn, files, chunksize=10, max_workers=os.cpu_count(), unit="files"
         )
         for item in futures:
             if item.has_error():
@@ -230,11 +265,20 @@ def main(path, outfile, ignore=None, ignore_dirs=None, drop_hidden_files=True):
 @click.option(
     "-o", "--outfile", type=click.File("w"), default="-", help="output filename"
 )
+@click.option(
+    "--checksum-type",
+    type=click.Choice(list(HASHERS)),
+    default="imohash-64k",
+    show_default=True,
+    help="checksum algorithm to use. imohash-64k is fast (samples 3×64KB) but "
+         "may miss changes in unsampled regions of large files. xxhash and md5 "
+         "read the full file and are collision-free but slower on large files.",
+)
 @click.argument("path")
-def cli(path, outfile, ignore, ignore_dirs, drop_hidden_files):
+def cli(path, outfile, ignore, ignore_dirs, drop_hidden_files, checksum_type):
     """path to file or folder.
 
-    Calculates imohash checksum of file(s) at the given path.
+    Calculates checksum of file(s) at the given path.
     Results are presented as csv.
     """
     path = os.path.expanduser(path)
@@ -244,6 +288,7 @@ def cli(path, outfile, ignore, ignore_dirs, drop_hidden_files):
         ignore=ignore,
         ignore_dirs=ignore_dirs,
         drop_hidden_files=drop_hidden_files,
+        checksum_type=checksum_type,
     )
 
 
diff --git a/ptool/cli.py b/ptool/cli.py
index f17ea42..a1ade08 100644
--- a/ptool/cli.py
+++ b/ptool/cli.py
@@ -285,19 +285,28 @@ def prepare_rsync(outfile, ignore, Flag, threshold, lefthost, righthost, left, r
 @click.option(
     "-o", "--outfile", type=click.File("w"), default="-", help="output filename"
 )
+@click.option(
+    "--checksum-type",
+    type=click.Choice(["imohash-64k", "md5", "xxhash"]),
+    default="imohash-64k",
+    show_default=True,
+    help="checksum algorithm. imohash-64k is fast but samples only 3×64KB of "
+         "large files. xxhash and md5 read files fully and are collision-free "
+         "but slower. Both CSVs being compared must use the same algorithm.",
+)
 @click.argument("path")
-def checksums(path, outfile, ignore, ignore_dirs, drop_hidden_files):
-    """Calculates imohash checksum of file(s) at the given path.
+def checksums(path, outfile, ignore, ignore_dirs, drop_hidden_files, checksum_type):
+    """Calculates checksum of file(s) at the given path.
     Results are presented as csv.
 
     `--ignore` and `--ignore-dirs` support *wildcards* in filtering down the
     matches.  If no *wildcards* are provided, then it performs a literal
     match. For multiple patterns, use comma separation.
     """
-    from . import checksums
+    from . import checksums as cs
 
     path = os.path.expanduser(path)
-    checksums.main(path, outfile, ignore, ignore_dirs, drop_hidden_files)
+    cs.main(path, outfile, ignore, ignore_dirs, drop_hidden_files, checksum_type)
 
 
 if __name__ == "__main__":
diff --git a/setup.py b/setup.py
index 46f6131..15ba215 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,11 @@
         "pyarrow",
         "imohash",
         "tqdm",
+        "xxhash",
     ],
+    extras_require={
+        "dev": ["pytest"],
+    },
     entry_points="""
         [console_scripts]
         ptool=ptool.cli:cli
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/test_imohash_false_negatives.py b/tests/test_imohash_false_negatives.py
new file mode 100644
index 0000000..4dbf2c7
--- /dev/null
+++ b/tests/test_imohash_false_negatives.py
@@ -0,0 +1,68 @@
+import hashlib
+
+from imohash import hashfile
+
+SAMPLE_THRESHOLD = 128 * 1024
+SAMPLE_SIZE = 16 * 1024
+
+
+def _md5(path) -> str:
+    return hashlib.md5(path.read_bytes()).hexdigest()
+
+
+class TestImohashFalseNegatives:
+
+    def test_false_negative_large_files(self, tmp_path):
+        """Two large files identical in sampled regions but different in the
+        unsampled middle produce the same imohash — demonstrating the known
+        false-negative risk for files >= SAMPLE_THRESHOLD."""
+        size = 512 * 1024  # 512 KB, well above SAMPLE_THRESHOLD
+
+        base = bytearray(size)
+
+        # Sampled windows for a 512 KB file:
+        #   first:  [0 : 16 KB]
+        #   middle: [256 KB : 272 KB]
+        #   last:   [496 KB : 512 KB]
+        # Byte at 17 KB sits safely in the unsampled gap.
+        unsampled_offset = SAMPLE_SIZE + 1024  # 17 KB
+
+        variant = bytearray(base)
+        variant[unsampled_offset] = 0xFF
+
+        file_a = tmp_path / "file_a.bin"
+        file_b = tmp_path / "file_b.bin"
+        file_a.write_bytes(bytes(base))
+        file_b.write_bytes(bytes(variant))
+
+        # imohash cannot distinguish them — false negative
+        assert hashfile(str(file_a), hexdigest=True) == hashfile(str(file_b), hexdigest=True)
+
+        # MD5 correctly identifies the difference
+        assert _md5(file_a) != _md5(file_b)
+
+    def test_no_false_negative_small_files(self, tmp_path):
+        """Files below SAMPLE_THRESHOLD are hashed in full, so any difference
+        is always detected."""
+        size = SAMPLE_THRESHOLD - 1
+
+        base = bytearray(size)
+        variant = bytearray(base)
+        variant[100] = 0xFF
+
+        file_a = tmp_path / "small_a.bin"
+        file_b = tmp_path / "small_b.bin"
+        file_a.write_bytes(bytes(base))
+        file_b.write_bytes(bytes(variant))
+
+        assert hashfile(str(file_a), hexdigest=True) != hashfile(str(file_b), hexdigest=True)
+
+    def test_different_sizes_never_collide(self, tmp_path):
+        """imohash encodes file size in the digest, so files of different sizes
+        always produce different hashes regardless of content."""
+        file_a = tmp_path / "size_a.bin"
+        file_b = tmp_path / "size_b.bin"
+        file_a.write_bytes(b"\x00" * (512 * 1024))
+        file_b.write_bytes(b"\x00" * (512 * 1024 + 1))
+
+        assert hashfile(str(file_a), hexdigest=True) != hashfile(str(file_b), hexdigest=True)

From 6f919a8ed7abb2ccac5973d9d5cfbcf20dac2940 Mon Sep 17 00:00:00 2001
From: PavanSiligam <pavan.siligam@gmail.com>
Date: Wed, 13 May 2026 02:03:15 +0200
Subject: [PATCH 2/2] Add tests for new checksum types and fix str[pyarrow]
 dtype bug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add test_checksum_types.py covering:
  - correct prefix embedding for imohash-64k, md5, xxhash
  - md5 and xxhash catching differences that imohash-64k misses
  - all hashers agreeing on truly identical files
  - compare() raising UsageError on checksum type mismatch
  - compare() passing when both CSVs use the same type
- Fix pre-existing bug in read_csv: "str[pyarrow]" → "string[pyarrow]"
  (invalid dtype string in pandas 2.x, exposed by the new tests)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 ptool/analyse.py             |   2 +-
 tests/test_checksum_types.py | 124 +++++++++++++++++++++++++++++++++++
 2 files changed, 125 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_checksum_types.py

diff --git a/ptool/analyse.py b/ptool/analyse.py
index 08b1791..f44971e 100644
--- a/ptool/analyse.py
+++ b/ptool/analyse.py
@@ -29,7 +29,7 @@ def read_csv(filename, ignore=None, drop_duplicates=False):
     df["rparent"] = df.rpath.apply(os.path.dirname)
     for name, dtype in df.dtypes.items():
         if dtype == "object":
-            df[name] = df[name].astype("str[pyarrow]")
+            df[name] = df[name].astype("string[pyarrow]")
     if ignore:
         df = df[~df.rparent.str.contains(ignore)]
         df = df[~df.fname.str.contains(ignore)]
diff --git a/tests/test_checksum_types.py b/tests/test_checksum_types.py
new file mode 100644
index 0000000..b2dc424
--- /dev/null
+++ b/tests/test_checksum_types.py
@@ -0,0 +1,124 @@
+import pytest
+
+from ptool.checksums import make_stats, HASHERS, IMOHASH_SAMPLE_SIZE
+
+
+def _write(path, content=b"\x00" * 1024):
+    path.write_bytes(content)
+    return path
+
+
+class TestHasherPrefixes:
+    """Each hasher must embed its type name as a prefix in the output."""
+
+    def test_imohash_prefix(self, tmp_path):
+        f = _write(tmp_path / "f.bin")
+        result = make_stats("imohash-64k")(str(f))
+        assert not result.has_error()
+        assert result.value.startswith("imohash-64k:")
+
+    def test_md5_prefix(self, tmp_path):
+        f = _write(tmp_path / "f.bin")
+        result = make_stats("md5")(str(f))
+        assert not result.has_error()
+        assert result.value.startswith("md5:")
+
+    def test_xxhash_prefix(self, tmp_path):
+        f = _write(tmp_path / "f.bin")
+        result = make_stats("xxhash")(str(f))
+        assert not result.has_error()
+        assert result.value.startswith("xxhash:")
+
+
+class TestHasherCorrectness:
+    """Full-file hashers (md5, xxhash) must distinguish files that imohash misses."""
+
+    def _make_false_negative_pair(self, tmp_path):
+        """Two large files identical in imohash sample windows but different in between."""
+        size = 512 * 1024
+        base = bytearray(size)
+        variant = bytearray(base)
+        # Byte at 17 KB — safely between first (0–64 KB) and middle (256–320 KB) windows
+        variant[IMOHASH_SAMPLE_SIZE + 1024] = 0xFF
+        file_a = tmp_path / "a.bin"
+        file_b = tmp_path / "b.bin"
+        file_a.write_bytes(bytes(base))
+        file_b.write_bytes(bytes(variant))
+        return file_a, file_b
+
+    def test_imohash_misses_unsampled_difference(self, tmp_path):
+        file_a, file_b = self._make_false_negative_pair(tmp_path)
+        hash_a = make_stats("imohash-64k")(str(file_a)).value.split(",")[0]
+        hash_b = make_stats("imohash-64k")(str(file_b)).value.split(",")[0]
+        assert hash_a == hash_b
+
+    def test_md5_catches_unsampled_difference(self, tmp_path):
+        file_a, file_b = self._make_false_negative_pair(tmp_path)
+        hash_a = make_stats("md5")(str(file_a)).value.split(",")[0]
+        hash_b = make_stats("md5")(str(file_b)).value.split(",")[0]
+        assert hash_a != hash_b
+
+    def test_xxhash_catches_unsampled_difference(self, tmp_path):
+        file_a, file_b = self._make_false_negative_pair(tmp_path)
+        hash_a = make_stats("xxhash")(str(file_a)).value.split(",")[0]
+        hash_b = make_stats("xxhash")(str(file_b)).value.split(",")[0]
+        assert hash_a != hash_b
+
+    def test_identical_files_agree_across_all_types(self, tmp_path):
+        """All hashers should agree that truly identical files have the same hash."""
+        content = b"\xAB" * (512 * 1024)
+        file_a = tmp_path / "a.bin"
+        file_b = tmp_path / "b.bin"
+        file_a.write_bytes(content)
+        file_b.write_bytes(content)
+        for checksum_type in HASHERS:
+            hash_a = make_stats(checksum_type)(str(file_a)).value.split(",")[0]
+            hash_b = make_stats(checksum_type)(str(file_b)).value.split(",")[0]
+            assert hash_a == hash_b, f"{checksum_type} disagreed on identical files"
+
+
+class TestChecksumTypeMismatchGuard:
+    """compare() must reject CSVs generated with different checksum types."""
+
+    def _make_csv(self, tmp_path, checksum_type, filename="snapshot.csv"):
+        f = tmp_path / "data.bin"
+        f.write_bytes(b"\x00" * 1024)
+        record = make_stats(checksum_type)(str(f)).value
+        csv_path = tmp_path / filename
+        csv_path.write_text(f"checksum,fsize,mtime,fpath\n{record}\n")
+        return str(csv_path)
+
+    def test_mismatched_types_raise_error(self, tmp_path):
+        import click
+        from ptool.analyse import read_csv, compare
+
+        left_dir = tmp_path / "left"
+        right_dir = tmp_path / "right"
+        left_dir.mkdir()
+        right_dir.mkdir()
+
+        left_csv = self._make_csv(left_dir, "imohash-64k")
+        right_csv = self._make_csv(right_dir, "md5")
+
+        left, _ = read_csv(left_csv)
+        right, _ = read_csv(right_csv)
+
+        with pytest.raises(click.UsageError, match="Checksum type mismatch"):
+            compare(left, right)
+
+    def test_matching_types_do_not_raise(self, tmp_path):
+        from ptool.analyse import read_csv, compare
+
+        left_dir = tmp_path / "left"
+        right_dir = tmp_path / "right"
+        left_dir.mkdir()
+        right_dir.mkdir()
+
+        left_csv = self._make_csv(left_dir, "xxhash")
+        right_csv = self._make_csv(right_dir, "xxhash")
+
+        left, _ = read_csv(left_csv)
+        right, _ = read_csv(right_csv)
+
+        # should not raise
+        compare(left, right)