From 43463cc2cfa2b72a96404968631b3ad8032616ca Mon Sep 17 00:00:00 2001 From: PavanSiligam Date: Wed, 13 May 2026 02:00:58 +0200 Subject: [PATCH 1/2] Add configurable checksum types and upgrade imohash sample size - Upgrade imohash default sample size from 16 KB to 64 KB (prefix: imohash-64k) to reduce false-negative risk in large file comparisons - Add --checksum-type option to checksums command supporting imohash-64k (default), md5, and xxhash - Guard compare/summary against mixing CSVs with different checksum types, raising a clear error to prevent silent mismatches - Add xxhash to dependencies (setup.py, environment.yaml) - Add tests documenting imohash false-negative behaviour Closes #4 Co-Authored-By: Claude Sonnet 4.6 --- environment.yaml | 1 + ptool/analyse.py | 22 +++++-- ptool/checksums.py | 83 +++++++++++++++++++++------ ptool/cli.py | 17 ++++-- setup.py | 4 ++ tests/__init__.py | 0 tests/test_imohash_false_negatives.py | 68 ++++++++++++++++++++++ 7 files changed, 168 insertions(+), 27 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/test_imohash_false_negatives.py diff --git a/environment.yaml b/environment.yaml index bcac994..3829fd7 100644 --- a/environment.yaml +++ b/environment.yaml @@ -20,3 +20,4 @@ dependencies: - pip: - imohash - tqdm + - xxhash diff --git a/ptool/analyse.py b/ptool/analyse.py index e153c51..08b1791 100644 --- a/ptool/analyse.py +++ b/ptool/analyse.py @@ -1,10 +1,12 @@ -import os import itertools -import pandas as pd -import numpy as np -import humanize +import os from collections import defaultdict +import click +import humanize +import numpy as np +import pandas as pd + __all__ = [ "read_csv", "compare", @@ -31,6 +33,7 @@ def read_csv(filename, ignore=None, drop_duplicates=False): if ignore: df = df[~df.rparent.str.contains(ignore)] df = df[~df.fname.str.contains(ignore)] + df["checksum_type"] = df.checksum.str.split(":").str[0] df = df.sort_values(by=["checksum", "mtime"]) dups = df[ df.duplicated(subset=["checksum", "fname"]).values @@ -74,7 +77,18 @@ def directory_map(m): return (m[["rparent_left", "rparent_right"]]).drop_duplicates() +def _assert_compatible_checksums(left, right): + lt = left.checksum_type.iloc[0] + rt = right.checksum_type.iloc[0] + if lt != rt: + raise click.UsageError( + f"Checksum type mismatch: left CSV uses '{lt}' but right CSV uses '{rt}'. " + "Re-generate both snapshots with the same --checksum-type before comparing." + ) + + def compare(left, right, relabel=False, threshold=0.1): + _assert_compatible_checksums(left, right) by_hash = merge(left, right) by_name = merge(left, right, on="fname") by_hash["flag"] = "" diff --git a/ptool/checksums.py b/ptool/checksums.py index f2d19a4..e9e50c6 100755 --- a/ptool/checksums.py +++ b/ptool/checksums.py @@ -1,6 +1,7 @@ #!/usr/bin/env python import fnmatch +import hashlib import os import re import sys @@ -12,6 +13,8 @@ from imohash import hashfile from tqdm.contrib.concurrent import process_map +IMOHASH_SAMPLE_SIZE = 64 * 1024 # upgraded from 16 KB to reduce false-negative risk + not_hidden_files_or_dirs = re.compile(r"^[^.]").match @@ -117,21 +120,52 @@ def result(self): return self.value -def hasher(filename): - "Calucates imohash for a given file" - return f"imohash:{hashfile(filename, hexdigest=True)}" +def _imohash(filename): + return f"imohash-64k:{hashfile(filename, sample_size=IMOHASH_SAMPLE_SIZE, hexdigest=True)}" + + +def _md5(filename): + h = hashlib.md5() + with open(filename, "rb") as f: + for chunk in iter(lambda: f.read(8 * 1024 * 1024), b""): + h.update(chunk) + return f"md5:{h.hexdigest()}" + + +def _xxhash(filename): + import xxhash + h = xxhash.xxh3_64() + with open(filename, "rb") as f: + for chunk in iter(lambda: f.read(8 * 1024 * 1024), b""): + h.update(chunk) + return f"xxhash:{h.hexdigest()}" + + +HASHERS = { + "imohash-64k": _imohash, + "md5": _md5, + "xxhash": _xxhash, +} -def stats(fpath, stat=os.stat): - "Generates record with imohash and file stats information" - try: - checksum = hasher(fpath) - st = stat(fpath) - record = f"{checksum},{st.st_size},{st.st_mtime},{fpath}" - record = Results(value=record) - except Exception as e: - record = Results(exc=f"{str(e)}") - return record +def make_stats(checksum_type): + "Returns a stats function bound to the given checksum type" + hasher = HASHERS[checksum_type] + + def stats(fpath, stat=os.stat): + try: + checksum = hasher(fpath) + st = stat(fpath) + record = f"{checksum},{st.st_size},{st.st_mtime},{fpath}" + return Results(value=record) + except Exception as e: + return Results(exc=f"{str(e)}") + + return stats + + +# default stats function for backwards-compatible use +stats = make_stats("imohash-64k") def scanner(path, ignore=None, ignore_dirs=None, drop_hidden_files=True): @@ -178,8 +212,8 @@ def get_files(path, ignore=None, ignore_dirs=None, drop_hidden_files=True): return list(files_iter) -def main(path, outfile, ignore=None, ignore_dirs=None, drop_hidden_files=True): - "Calculates hashs of all the files in parallel" +def main(path, outfile, ignore=None, ignore_dirs=None, drop_hidden_files=True, checksum_type="imohash-64k"): + "Calculates hashes of all the files in parallel" echo("Gathering files...") with timethis("getting files"): if os.path.isdir(path): @@ -194,11 +228,12 @@ def main(path, outfile, ignore=None, ignore_dirs=None, drop_hidden_files=True): nfiles = len(files) echo(f"nfiles: {nfiles}") results = ["checksum,fsize,mtime,fpath"] - echo("Calculating hashes...") + echo(f"Calculating hashes ({checksum_type})...") errors = [] + stats_fn = make_stats(checksum_type) with timethis("calculating hashes"): futures = process_map( - stats, files, chunksize=10, max_workers=os.cpu_count(), unit="files" + stats_fn, files, chunksize=10, max_workers=os.cpu_count(), unit="files" ) for item in futures: if item.has_error(): @@ -230,11 +265,20 @@ def main(path, outfile, ignore=None, ignore_dirs=None, drop_hidden_files=True): @click.option( "-o", "--outfile", type=click.File("w"), default="-", help="output filename" ) +@click.option( + "--checksum-type", + type=click.Choice(list(HASHERS)), + default="imohash-64k", + show_default=True, + help="checksum algorithm to use. imohash-64k is fast (samples 3×64KB) but " + "may miss changes in unsampled regions of large files. xxhash and md5 " + "read the full file and are collision-free but slower on large files.", +) @click.argument("path") -def cli(path, outfile, ignore, ignore_dirs, drop_hidden_files): +def cli(path, outfile, ignore, ignore_dirs, drop_hidden_files, checksum_type): """path to file or folder. - Calculates imohash checksum of file(s) at the given path. + Calculates checksum of file(s) at the given path. Results are presented as csv. """ path = os.path.expanduser(path) @@ -244,6 +288,7 @@ def cli(path, outfile, ignore, ignore_dirs, drop_hidden_files): ignore=ignore, ignore_dirs=ignore_dirs, drop_hidden_files=drop_hidden_files, + checksum_type=checksum_type, ) diff --git a/ptool/cli.py b/ptool/cli.py index f17ea42..a1ade08 100644 --- a/ptool/cli.py +++ b/ptool/cli.py @@ -285,19 +285,28 @@ def prepare_rsync(outfile, ignore, Flag, threshold, lefthost, righthost, left, r @click.option( "-o", "--outfile", type=click.File("w"), default="-", help="output filename" ) +@click.option( + "--checksum-type", + type=click.Choice(["imohash-64k", "md5", "xxhash"]), + default="imohash-64k", + show_default=True, + help="checksum algorithm. imohash-64k is fast but samples only 3×64KB of " + "large files. xxhash and md5 read files fully and are collision-free " + "but slower. Both CSVs being compared must use the same algorithm.", +) @click.argument("path") -def checksums(path, outfile, ignore, ignore_dirs, drop_hidden_files): - """Calculates imohash checksum of file(s) at the given path. +def checksums(path, outfile, ignore, ignore_dirs, drop_hidden_files, checksum_type): + """Calculates checksum of file(s) at the given path. Results are presented as csv. `--ignore` and `--ignore-dirs` support *wildcards* in filtering down the matches. If no *wildcards* are provided, then it performs a literal match. For multiple patterns, use comma separation. """ - from . import checksums + from . import checksums as cs path = os.path.expanduser(path) - checksums.main(path, outfile, ignore, ignore_dirs, drop_hidden_files) + cs.main(path, outfile, ignore, ignore_dirs, drop_hidden_files, checksum_type) if __name__ == "__main__": diff --git a/setup.py b/setup.py index 46f6131..15ba215 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,11 @@ "pyarrow", "imohash", "tqdm", + "xxhash", ], + extras_require={ + "dev": ["pytest"], + }, entry_points=""" [console_scripts] ptool=ptool.cli:cli diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_imohash_false_negatives.py b/tests/test_imohash_false_negatives.py new file mode 100644 index 0000000..4dbf2c7 --- /dev/null +++ b/tests/test_imohash_false_negatives.py @@ -0,0 +1,68 @@ +import hashlib + +from imohash import hashfile + +SAMPLE_THRESHOLD = 128 * 1024 +SAMPLE_SIZE = 16 * 1024 + + +def _md5(path) -> str: + return hashlib.md5(path.read_bytes()).hexdigest() + + +class TestImohashFalseNegatives: + + def test_false_negative_large_files(self, tmp_path): + """Two large files identical in sampled regions but different in the + unsampled middle produce the same imohash — demonstrating the known + false-negative risk for files >= SAMPLE_THRESHOLD.""" + size = 512 * 1024 # 512 KB, well above SAMPLE_THRESHOLD + + base = bytearray(size) + + # Sampled windows for a 512 KB file: + # first: [0 : 16 KB] + # middle: [256 KB : 272 KB] + # last: [496 KB : 512 KB] + # Byte at 17 KB sits safely in the unsampled gap. + unsampled_offset = SAMPLE_SIZE + 1024 # 17 KB + + variant = bytearray(base) + variant[unsampled_offset] = 0xFF + + file_a = tmp_path / "file_a.bin" + file_b = tmp_path / "file_b.bin" + file_a.write_bytes(bytes(base)) + file_b.write_bytes(bytes(variant)) + + # imohash cannot distinguish them — false negative + assert hashfile(str(file_a), hexdigest=True) == hashfile(str(file_b), hexdigest=True) + + # MD5 correctly identifies the difference + assert _md5(file_a) != _md5(file_b) + + def test_no_false_negative_small_files(self, tmp_path): + """Files below SAMPLE_THRESHOLD are hashed in full, so any difference + is always detected.""" + size = SAMPLE_THRESHOLD - 1 + + base = bytearray(size) + variant = bytearray(base) + variant[100] = 0xFF + + file_a = tmp_path / "small_a.bin" + file_b = tmp_path / "small_b.bin" + file_a.write_bytes(bytes(base)) + file_b.write_bytes(bytes(variant)) + + assert hashfile(str(file_a), hexdigest=True) != hashfile(str(file_b), hexdigest=True) + + def test_different_sizes_never_collide(self, tmp_path): + """imohash encodes file size in the digest, so files of different sizes + always produce different hashes regardless of content.""" + file_a = tmp_path / "size_a.bin" + file_b = tmp_path / "size_b.bin" + file_a.write_bytes(b"\x00" * (512 * 1024)) + file_b.write_bytes(b"\x00" * (512 * 1024 + 1)) + + assert hashfile(str(file_a), hexdigest=True) != hashfile(str(file_b), hexdigest=True) From 6f919a8ed7abb2ccac5973d9d5cfbcf20dac2940 Mon Sep 17 00:00:00 2001 From: PavanSiligam Date: Wed, 13 May 2026 02:03:15 +0200 Subject: [PATCH 2/2] Add tests for new checksum types and fix str[pyarrow] dtype bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add test_checksum_types.py covering: - correct prefix embedding for imohash-64k, md5, xxhash - md5 and xxhash catching differences that imohash-64k misses - all hashers agreeing on truly identical files - compare() raising UsageError on checksum type mismatch - compare() passing when both CSVs use the same type - Fix pre-existing bug in read_csv: "str[pyarrow]" → "string[pyarrow]" (invalid dtype string in pandas 2.x, exposed by the new tests) Co-Authored-By: Claude Sonnet 4.6 --- ptool/analyse.py | 2 +- tests/test_checksum_types.py | 124 +++++++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 tests/test_checksum_types.py diff --git a/ptool/analyse.py b/ptool/analyse.py index 08b1791..f44971e 100644 --- a/ptool/analyse.py +++ b/ptool/analyse.py @@ -29,7 +29,7 @@ def read_csv(filename, ignore=None, drop_duplicates=False): df["rparent"] = df.rpath.apply(os.path.dirname) for name, dtype in df.dtypes.items(): if dtype == "object": - df[name] = df[name].astype("str[pyarrow]") + df[name] = df[name].astype("string[pyarrow]") if ignore: df = df[~df.rparent.str.contains(ignore)] df = df[~df.fname.str.contains(ignore)] diff --git a/tests/test_checksum_types.py b/tests/test_checksum_types.py new file mode 100644 index 0000000..b2dc424 --- /dev/null +++ b/tests/test_checksum_types.py @@ -0,0 +1,124 @@ +import pytest + +from ptool.checksums import make_stats, HASHERS, IMOHASH_SAMPLE_SIZE + + +def _write(path, content=b"\x00" * 1024): + path.write_bytes(content) + return path + + +class TestHasherPrefixes: + """Each hasher must embed its type name as a prefix in the output.""" + + def test_imohash_prefix(self, tmp_path): + f = _write(tmp_path / "f.bin") + result = make_stats("imohash-64k")(str(f)) + assert not result.has_error() + assert result.value.startswith("imohash-64k:") + + def test_md5_prefix(self, tmp_path): + f = _write(tmp_path / "f.bin") + result = make_stats("md5")(str(f)) + assert not result.has_error() + assert result.value.startswith("md5:") + + def test_xxhash_prefix(self, tmp_path): + f = _write(tmp_path / "f.bin") + result = make_stats("xxhash")(str(f)) + assert not result.has_error() + assert result.value.startswith("xxhash:") + + +class TestHasherCorrectness: + """Full-file hashers (md5, xxhash) must distinguish files that imohash misses.""" + + def _make_false_negative_pair(self, tmp_path): + """Two large files identical in imohash sample windows but different in between.""" + size = 512 * 1024 + base = bytearray(size) + variant = bytearray(base) + # Byte at 17 KB — safely between first (0–64 KB) and middle (256–320 KB) windows + variant[IMOHASH_SAMPLE_SIZE + 1024] = 0xFF + file_a = tmp_path / "a.bin" + file_b = tmp_path / "b.bin" + file_a.write_bytes(bytes(base)) + file_b.write_bytes(bytes(variant)) + return file_a, file_b + + def test_imohash_misses_unsampled_difference(self, tmp_path): + file_a, file_b = self._make_false_negative_pair(tmp_path) + hash_a = make_stats("imohash-64k")(str(file_a)).value.split(",")[0] + hash_b = make_stats("imohash-64k")(str(file_b)).value.split(",")[0] + assert hash_a == hash_b + + def test_md5_catches_unsampled_difference(self, tmp_path): + file_a, file_b = self._make_false_negative_pair(tmp_path) + hash_a = make_stats("md5")(str(file_a)).value.split(",")[0] + hash_b = make_stats("md5")(str(file_b)).value.split(",")[0] + assert hash_a != hash_b + + def test_xxhash_catches_unsampled_difference(self, tmp_path): + file_a, file_b = self._make_false_negative_pair(tmp_path) + hash_a = make_stats("xxhash")(str(file_a)).value.split(",")[0] + hash_b = make_stats("xxhash")(str(file_b)).value.split(",")[0] + assert hash_a != hash_b + + def test_identical_files_agree_across_all_types(self, tmp_path): + """All hashers should agree that truly identical files have the same hash.""" + content = b"\xAB" * (512 * 1024) + file_a = tmp_path / "a.bin" + file_b = tmp_path / "b.bin" + file_a.write_bytes(content) + file_b.write_bytes(content) + for checksum_type in HASHERS: + hash_a = make_stats(checksum_type)(str(file_a)).value.split(",")[0] + hash_b = make_stats(checksum_type)(str(file_b)).value.split(",")[0] + assert hash_a == hash_b, f"{checksum_type} disagreed on identical files" + + +class TestChecksumTypeMismatchGuard: + """compare() must reject CSVs generated with different checksum types.""" + + def _make_csv(self, tmp_path, checksum_type, filename="snapshot.csv"): + f = tmp_path / "data.bin" + f.write_bytes(b"\x00" * 1024) + record = make_stats(checksum_type)(str(f)).value + csv_path = tmp_path / filename + csv_path.write_text(f"checksum,fsize,mtime,fpath\n{record}\n") + return str(csv_path) + + def test_mismatched_types_raise_error(self, tmp_path): + import click + from ptool.analyse import read_csv, compare + + left_dir = tmp_path / "left" + right_dir = tmp_path / "right" + left_dir.mkdir() + right_dir.mkdir() + + left_csv = self._make_csv(left_dir, "imohash-64k") + right_csv = self._make_csv(right_dir, "md5") + + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + + with pytest.raises(click.UsageError, match="Checksum type mismatch"): + compare(left, right) + + def test_matching_types_do_not_raise(self, tmp_path): + from ptool.analyse import read_csv, compare + + left_dir = tmp_path / "left" + right_dir = tmp_path / "right" + left_dir.mkdir() + right_dir.mkdir() + + left_csv = self._make_csv(left_dir, "xxhash") + right_csv = self._make_csv(right_dir, "xxhash") + + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + + # should not raise + compare(left, right)