From 73172df7f3292ebc538d61a2d512e99e1b5d21b2 Mon Sep 17 00:00:00 2001 From: PavanSiligam Date: Wed, 13 May 2026 02:17:00 +0200 Subject: [PATCH 1/7] Modernise repository to open-source standards MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace setup.py with pyproject.toml (PEP 517/518 compliant) - Fix license metadata: MIT → GPL-3.0 (matches LICENSE file) - Fix PyPI classifier: invalid version string → Development Status 4 - Beta - Fix url: GitLab → GitHub - Add CONTRIBUTING.md, CODE_OF_CONDUCT.md, SECURITY.md, CHANGELOG.md - Add GitHub issue templates (bug report, feature request) - Add GitHub pull request template - Add GitHub Actions CI workflow (Python 3.9–3.12) - Expand .gitignore with standard Python, editor, and OS entries Co-Authored-By: Claude Sonnet 4.6 --- .github/ISSUE_TEMPLATE/bug_report.yml | 43 ++++++++++++++++++ .github/ISSUE_TEMPLATE/feature_request.yml | 25 +++++++++++ .github/PULL_REQUEST_TEMPLATE.md | 16 +++++++ .github/workflows/ci.yml | 28 ++++++++++++ .gitignore | 29 +++++++++++- CHANGELOG.md | 42 +++++++++++++++++ CODE_OF_CONDUCT.md | 41 +++++++++++++++++ CONTRIBUTING.md | 52 ++++++++++++++++++++++ SECURITY.md | 23 ++++++++++ pyproject.toml | 47 +++++++++++++++++++ setup.py | 44 ------------------ 11 files changed, 344 insertions(+), 46 deletions(-) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.yml create mode 100644 .github/ISSUE_TEMPLATE/feature_request.yml create mode 100644 .github/PULL_REQUEST_TEMPLATE.md create mode 100644 .github/workflows/ci.yml create mode 100644 CHANGELOG.md create mode 100644 CODE_OF_CONDUCT.md create mode 100644 CONTRIBUTING.md create mode 100644 SECURITY.md create mode 100644 pyproject.toml delete mode 100644 setup.py diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000..a1a6c8d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,43 @@ +name: Bug Report +description: Report a bug in ptool +labels: ["bug"] +body: + - type: markdown + attributes: + value: "Thanks for taking the time to report a bug!" + + - type: textarea + id: description + attributes: + label: Description + description: A clear description of what the bug is. + validations: + required: true + + - type: textarea + id: reproduce + attributes: + label: Steps to reproduce + description: Minimal steps to reproduce the behaviour. + placeholder: | + 1. Run `ptool checksums ...` + 2. Then run `ptool compare ...` + 3. See error + validations: + required: true + + - type: textarea + id: expected + attributes: + label: Expected behaviour + description: What you expected to happen. + validations: + required: true + + - type: textarea + id: environment + attributes: + label: Environment + description: Output of `ptool --version` and `python --version`. + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 0000000..cad08dd --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,25 @@ +name: Feature Request +description: Suggest a new feature or improvement +labels: ["enhancement"] +body: + - type: textarea + id: problem + attributes: + label: Problem + description: What problem does this feature solve? + validations: + required: true + + - type: textarea + id: solution + attributes: + label: Proposed solution + description: Describe the solution you'd like. + validations: + required: true + + - type: textarea + id: alternatives + attributes: + label: Alternatives considered + description: Any alternative approaches you've thought about. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..940a1a3 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,16 @@ +## Summary + + + +## Changes + +- + +## Test plan + +- [ ] Tests added or updated +- [ ] `pytest tests/ -v` passes locally + +## Related issues + + diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..fdff357 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,28 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: pip install -e ".[dev]" + + - name: Run tests + run: pytest tests/ -v diff --git a/.gitignore b/.gitignore index 76d970c..114de51 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,28 @@ -__pycache__ -ptool.egg-info +# Python +__pycache__/ +*.py[cod] +*.egg-info/ +*.egg +dist/ +build/ + +# Virtual environments +.venv/ +.env/ + +# Testing +.pytest_cache/ +.coverage +htmlcov/ + +# Editor +.vscode/ +.idea/ +*.swp +*.swo + +# HPC slurm-*.out + +# OS +.DS_Store diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..4da4b4b --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,42 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Added +- Configurable checksum types: `imohash-64k` (default), `xxhash`, `md5` +- `--checksum-type` option on `ptool checksums` command +- Guard in `compare` and `summary` against mixing CSVs with different checksum types +- Test suite covering checksum type correctness and false-negative behaviour +- `pyproject.toml` replacing legacy `setup.py` +- `CONTRIBUTING.md`, `CODE_OF_CONDUCT.md`, `SECURITY.md`, `CHANGELOG.md` +- GitHub Actions CI workflow +- GitHub issue and PR templates + +### Fixed +- Invalid `str[pyarrow]` dtype string in `read_csv` (now `string[pyarrow]`), which caused a `TypeError` on pandas 2.x +- Incorrect `license = "MIT"` metadata — corrected to GPL-3.0 to match the `LICENSE` file +- Invalid PyPI classifier `Development Status :: 0.2.2` — corrected to `Development Status :: 4 - Beta` +- Wrong `url` in package metadata (pointed to GitLab) — updated to GitHub + +### Changed +- imohash sample size upgraded from 16 KB to 64 KB to reduce false-negative risk + +## [0.2.2] - 2024-01-01 + +### Added +- `sync_options`: support for additional sync configuration options + +## [0.2.1] - 2024-01-01 + +### Fixed +- Truncated prefix handling in checksum paths + +## [0.2.0] - 2024-01-01 + +### Added +- Initial public release with `checksums`, `compare`, `summary`, and `prepare-rsync` commands diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..f430314 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,41 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, caste, color, religion, or sexual identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes +* Focusing on what is best not just for us as individuals, but for the overall community + +Examples of unacceptable behavior: + +* The use of sexualized language or imagery, and sexual attention or advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at **pavan.siligam@gmail.com**. All complaints will be reviewed and investigated promptly and fairly. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 2.1. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..a876390 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,52 @@ +# Contributing to ptool + +Thank you for considering contributing to ptool! This document describes how to get started. + +## Development setup + +1. Fork the repository and clone your fork: + + ```shell + git clone https://github.com//pool_tool.git + cd pool_tool + ``` + +2. Create the conda environment and install the package in editable mode: + + ```shell + conda env create -f environment.yaml + conda activate ptool + pip install -e ".[dev]" + ``` + +3. Verify the setup by running the test suite: + + ```shell + pytest tests/ -v + ``` + +## Workflow + +- Create a feature branch from `main`: + + ```shell + git checkout -b my-feature + ``` + +- Make your changes, add tests, and ensure all tests pass before opening a PR. +- Write clear, descriptive commit messages. +- Open a pull request against `main` and fill in the PR template. + +## Running tests + +```shell +pytest tests/ -v +``` + +## Reporting bugs + +Please use the [bug report template](https://github.com/esm-tools/pool_tool/issues/new?template=bug_report.yml) when filing issues. + +## Code of conduct + +This project follows the [Contributor Covenant Code of Conduct](CODE_OF_CONDUCT.md). By participating you agree to abide by its terms. diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..1712864 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,23 @@ +# Security Policy + +## Supported Versions + +| Version | Supported | +|---------|-----------| +| 0.2.x | Yes | +| < 0.2 | No | + +## Reporting a Vulnerability + +Please **do not** report security vulnerabilities through public GitHub issues. + +Instead, email **pavan.siligam@gmail.com** with the subject line `[ptool] Security Vulnerability`. + +Include as much of the following as possible: + +- Description of the vulnerability +- Steps to reproduce +- Potential impact +- Suggested fix (if any) + +You can expect an acknowledgement within 48 hours and a resolution timeline within 7 days. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..9c5d89d --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,47 @@ +[build-system] +requires = ["setuptools>=64", "wheel"] +build-backend = "setuptools.backends.legacy:build" + +[project] +name = "ptool" +version = "0.2.2" +description = "Analyse project data in pool directory at various sites" +readme = "README.md" +license = { file = "LICENSE" } +authors = [{ name = "Pavan Siligam", email = "pavan.siligam@gmail.com" }] +requires-python = ">=3.9" +dependencies = [ + "click", + "humanize", + "tabulate", + "pandas", + "pyarrow", + "imohash", + "tqdm", + "xxhash", +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: Information Technology", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] + +[project.optional-dependencies] +dev = ["pytest"] + +[project.urls] +Homepage = "https://github.com/esm-tools/pool_tool" +Repository = "https://github.com/esm-tools/pool_tool" +Issues = "https://github.com/esm-tools/pool_tool/issues" + +[project.scripts] +ptool = "ptool.cli:cli" + +[tool.setuptools.packages.find] +include = ["ptool"] diff --git a/setup.py b/setup.py deleted file mode 100644 index 46f6131..0000000 --- a/setup.py +++ /dev/null @@ -1,44 +0,0 @@ -# -*- coding: utf-8 -*- - -from setuptools import setup, find_packages -import pathlib - - -here = pathlib.Path(__file__).parent.resolve() -long_description = (here / "README.md").read_text(encoding="utf-8") - -setup( - name="ptool", - version="0.2.2", - description="Analyse project data in pool directory at various sites", - long_description=long_description, - long_description_content_type="text/markdown", - python_requires=">=3.9", - packages=find_packages(include=["ptool"]), - install_requires=[ - "click", - "humanize", - "tabulate", - "pandas", - "pyarrow", - "imohash", - "tqdm", - ], - entry_points=""" - [console_scripts] - ptool=ptool.cli:cli - """, - classifiers=[ - "Development Status :: 0.2.2", - "Intended Audience :: Developers", - "Intended Audience :: Information Technology", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - ], - author="Pavan Siligam", - author_email="pavan.siligam@gmail.com", - license="MIT", - url="https://gitlab.awi.de/hpc/pool", -) From a85ddfe55f376fadbe55d39f12bbf38a8a8e6b57 Mon Sep 17 00:00:00 2001 From: PavanSiligam Date: Wed, 13 May 2026 02:20:30 +0200 Subject: [PATCH 2/7] Add full repository test suite (51 tests) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - tests/test_checksums.py: split, ignore_re, Results, hasher, stats, scanner, get_files — covers all utility functions and file scanning logic - tests/test_analyse.py: read_csv, compare, compare_compact, merge, directory_map — covers identical/renamed/modified/unique classification - tests/test_cli.py: checksums, compare, summary, prepare-rsync commands via click CliRunner — covers all CLI entry points and flags - Fix pre-existing bug in read_csv: "str[pyarrow]" → "string[pyarrow]" (TypeError on pandas 2.x, exposed by the new tests) Co-Authored-By: Claude Sonnet 4.6 --- ptool/analyse.py | 2 +- tests/__init__.py | 0 tests/test_analyse.py | 159 +++++++++++++++++++++++++++++++++++++++ tests/test_checksums.py | 160 ++++++++++++++++++++++++++++++++++++++++ tests/test_cli.py | 125 +++++++++++++++++++++++++++++++ 5 files changed, 445 insertions(+), 1 deletion(-) create mode 100644 tests/__init__.py create mode 100644 tests/test_analyse.py create mode 100644 tests/test_checksums.py create mode 100644 tests/test_cli.py diff --git a/ptool/analyse.py b/ptool/analyse.py index e153c51..151e1f2 100644 --- a/ptool/analyse.py +++ b/ptool/analyse.py @@ -27,7 +27,7 @@ def read_csv(filename, ignore=None, drop_duplicates=False): df["rparent"] = df.rpath.apply(os.path.dirname) for name, dtype in df.dtypes.items(): if dtype == "object": - df[name] = df[name].astype("str[pyarrow]") + df[name] = df[name].astype("string[pyarrow]") if ignore: df = df[~df.rparent.str.contains(ignore)] df = df[~df.fname.str.contains(ignore)] diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_analyse.py b/tests/test_analyse.py new file mode 100644 index 0000000..a549ad5 --- /dev/null +++ b/tests/test_analyse.py @@ -0,0 +1,159 @@ +import os +import time + +import pytest + +from ptool.analyse import compare, compare_compact, directory_map, merge, read_csv +from ptool.checksums import stats + + +def _make_csv(tmp_path, name, files): + """Write a snapshot CSV from a dict of {filename: content_bytes}.""" + records = ["checksum,fsize,mtime,fpath"] + for fname, content in files.items(): + f = tmp_path / fname + f.write_bytes(content) + r = stats(str(f)) + assert not r.has_error(), f"stats failed for {fname}: {r.result()}" + records.append(r.value) + csv_path = tmp_path / f"{name}.csv" + csv_path.write_text("\n".join(records) + "\n") + return str(csv_path) + + +@pytest.fixture +def two_identical_pools(tmp_path): + left_dir = tmp_path / "left" + right_dir = tmp_path / "right" + left_dir.mkdir() + right_dir.mkdir() + content = {"data.nc": b"\xAB" * 1024} + left_csv = _make_csv(left_dir, "snapshot_left", content) + right_csv = _make_csv(right_dir, "snapshot_right", content) + return left_csv, right_csv + + +@pytest.fixture +def one_unique_pool(tmp_path): + left_dir = tmp_path / "left" + right_dir = tmp_path / "right" + left_dir.mkdir() + right_dir.mkdir() + left_csv = _make_csv(left_dir, "snapshot_left", {"only_left.nc": b"\xAA" * 1024, "common.nc": b"\xBB" * 1024}) + right_csv = _make_csv(right_dir, "snapshot_right", {"common.nc": b"\xBB" * 1024}) + return left_csv, right_csv + + +class TestReadCsv: + def test_loads_records(self, tmp_path): + d = tmp_path / "pool" + d.mkdir() + csv = _make_csv(d, "snap", {"a.nc": b"\x00" * 512}) + df, dups = read_csv(csv) + assert len(df) == 1 + assert "checksum" in df.columns + assert "fsize" in df.columns + assert "mtime" in df.columns + assert "fpath" in df.columns + + def test_derives_fname_and_rpath(self, tmp_path): + d = tmp_path / "pool" + d.mkdir() + csv = _make_csv(d, "snap", {"file.nc": b"\x00" * 256}) + df, _ = read_csv(csv) + assert df["fname"].iloc[0] == "file.nc" + assert "rpath" in df.columns + + def test_no_duplicates_when_files_unique(self, tmp_path): + d = tmp_path / "pool" + d.mkdir() + csv = _make_csv(d, "snap", {"a.nc": b"\xAA" * 512, "b.nc": b"\xBB" * 512}) + df, dups = read_csv(csv) + assert len(df) == 2 + assert len(dups) == 0 + + def test_filters_dash_checksums(self, tmp_path): + # read_csv filters rows where checksum == "-", but a CSV with only + # such rows leaves an empty DataFrame and commonpath([]) raises. + # Mixed CSVs (some valid rows) work fine. + d = tmp_path / "pool" + d.mkdir() + csv = _make_csv(d, "snap", {"a.nc": b"\x00" * 256}) + # Manually append a dash row to the valid CSV + with open(csv, "a") as f: + f.write("-,0,0.0,/some/phantom\n") + df, _ = read_csv(csv) + # The dash row is filtered out; only the real file remains + assert len(df) == 1 + assert all(df["checksum"] != "-") + + +class TestCompare: + def test_identical_files_classified_correctly(self, two_identical_pools): + left_csv, right_csv = two_identical_pools + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + result = compare(left, right) + assert "identical" in result.index + + def test_unique_files_classified_correctly(self, one_unique_pool): + left_csv, right_csv = one_unique_pool + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + result = compare(left, right) + assert "unique" in result.index + assert "identical" in result.index + + def test_renamed_files_classified_correctly(self, tmp_path): + left_dir = tmp_path / "left" + right_dir = tmp_path / "right" + left_dir.mkdir() + right_dir.mkdir() + content = b"\xCC" * 1024 + left_csv = _make_csv(left_dir, "snap_left", {"original.nc": content}) + right_csv = _make_csv(right_dir, "snap_right", {"renamed.nc": content}) + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + result = compare(left, right) + assert "renamed" in result.index + + def test_empty_result_when_no_overlap(self, tmp_path): + left_dir = tmp_path / "left" + right_dir = tmp_path / "right" + left_dir.mkdir() + right_dir.mkdir() + left_csv = _make_csv(left_dir, "snap_left", {"left_only.nc": b"\xAA" * 512}) + right_csv = _make_csv(right_dir, "snap_right", {"right_only.nc": b"\xBB" * 512}) + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + result = compare(left, right) + assert "identical" not in result.index + assert "unique" in result.index + + +class TestCompareCompact: + def test_returns_rpath_columns(self, two_identical_pools): + left_csv, right_csv = two_identical_pools + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + result = compare_compact(left, right) + assert "rpath_left" in result.columns + assert "rpath_right" in result.columns + + +class TestMergeAndDirectoryMap: + def test_merge_on_checksum(self, two_identical_pools): + left_csv, right_csv = two_identical_pools + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + m = merge(left, right) + assert len(m) == 1 + + def test_directory_map_returns_pairs(self, two_identical_pools): + left_csv, right_csv = two_identical_pools + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + m = merge(left, right) + dm = directory_map(m) + assert "rparent_left" in dm.columns + assert "rparent_right" in dm.columns diff --git a/tests/test_checksums.py b/tests/test_checksums.py new file mode 100644 index 0000000..6d55392 --- /dev/null +++ b/tests/test_checksums.py @@ -0,0 +1,160 @@ +import os + +import pytest + +from ptool.checksums import Results, get_files, hasher, ignore_re, scanner, split, stats + + +class TestSplit: + def test_simple(self): + assert split("a,b,c") == ["a", "b", "c"] + + def test_single(self): + assert split("a") == ["a"] + + def test_empty(self): + assert split("") == [] + + def test_none(self): + assert split(None) == [] + + def test_escaped_separator(self): + assert split(r"a\,b,c") == ["a,b", "c"] + + def test_escaped_separator_middle(self): + assert split(r"a,b\,c,d") == ["a", "b,c", "d"] + + def test_custom_separator(self): + assert split("a|b|c", sep="|") == ["a", "b", "c"] + + def test_custom_escape(self): + assert split("a|b#|c|d", sep="|", escape="#") == ["a", "b|c", "d"] + + +class TestIgnoreRe: + def test_no_pattern_matches_nothing(self): + fn = ignore_re(None) + assert fn("anything") is False + + def test_exact_match(self): + fn = ignore_re("foo.nc") + assert fn("foo.nc") is True + assert fn("bar.nc") is False + + def test_wildcard(self): + fn = ignore_re("*.nc") + assert fn("data.nc") is True + assert fn("data.csv") is False + + def test_multiple_patterns(self): + fn = ignore_re("*.nc,*.txt") + assert fn("data.nc") is True + assert fn("readme.txt") is True + assert fn("script.py") is False + + +class TestResults: + def test_value(self): + r = Results(value="ok") + assert not r.has_error() + assert r.result() == "ok" + + def test_exception(self): + r = Results(exc="something went wrong") + assert r.has_error() + assert r.result() == "something went wrong" + + +class TestHasher: + def test_returns_imohash_prefix(self, tmp_path): + f = tmp_path / "file.bin" + f.write_bytes(b"\x00" * 1024) + result = hasher(str(f)) + assert result.startswith("imohash:") + + def test_same_content_same_hash(self, tmp_path): + content = b"\xAB" * 1024 + a = tmp_path / "a.bin" + b = tmp_path / "b.bin" + a.write_bytes(content) + b.write_bytes(content) + assert hasher(str(a)) == hasher(str(b)) + + def test_different_content_different_hash(self, tmp_path): + a = tmp_path / "a.bin" + b = tmp_path / "b.bin" + a.write_bytes(b"\x00" * 1024) + b.write_bytes(b"\xFF" * 1024) + assert hasher(str(a)) != hasher(str(b)) + + +class TestStats: + def test_valid_file_produces_csv_record(self, tmp_path): + f = tmp_path / "data.bin" + f.write_bytes(b"\x00" * 512) + result = stats(str(f)) + assert not result.has_error() + parts = result.value.split(",") + assert len(parts) == 4 + assert parts[0].startswith("imohash:") + assert parts[1] == "512" # fsize + assert str(f) in parts[3] # fpath + + def test_missing_file_produces_error(self, tmp_path): + result = stats(str(tmp_path / "nonexistent.bin")) + assert result.has_error() + + +class TestScanner: + def test_finds_files(self, tmp_path): + (tmp_path / "a.nc").write_bytes(b"x") + (tmp_path / "b.nc").write_bytes(b"x") + found = list(scanner(str(tmp_path))) + assert len(found) == 2 + + def test_recurses_into_subdirs(self, tmp_path): + sub = tmp_path / "sub" + sub.mkdir() + (tmp_path / "root.nc").write_bytes(b"x") + (sub / "nested.nc").write_bytes(b"x") + found = list(scanner(str(tmp_path))) + assert len(found) == 2 + + def test_drops_hidden_files_by_default(self, tmp_path): + (tmp_path / "visible.nc").write_bytes(b"x") + (tmp_path / ".hidden").write_bytes(b"x") + found = list(scanner(str(tmp_path))) + assert len(found) == 1 + assert all(".hidden" not in f for f in found) + + def test_includes_hidden_files_when_disabled(self, tmp_path): + (tmp_path / "visible.nc").write_bytes(b"x") + (tmp_path / ".hidden").write_bytes(b"x") + found = list(scanner(str(tmp_path), drop_hidden_files=False)) + assert len(found) == 2 + + def test_ignore_pattern(self, tmp_path): + (tmp_path / "keep.nc").write_bytes(b"x") + (tmp_path / "skip.tmp").write_bytes(b"x") + found = list(scanner(str(tmp_path), ignore="*.tmp")) + assert len(found) == 1 + assert found[0].endswith("keep.nc") + + def test_ignore_dirs(self, tmp_path): + keep = tmp_path / "keep" + skip = tmp_path / "skip" + keep.mkdir() + skip.mkdir() + (keep / "a.nc").write_bytes(b"x") + (skip / "b.nc").write_bytes(b"x") + found = list(scanner(str(tmp_path), ignore_dirs="skip")) + assert len(found) == 1 + assert "keep" in found[0] + + +class TestGetFiles: + def test_returns_list(self, tmp_path): + (tmp_path / "a.nc").write_bytes(b"x") + result = get_files(str(tmp_path)) + assert isinstance(result, list) + assert len(result) == 1 diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..775228b --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,125 @@ +import os + +import pytest +from click.testing import CliRunner + +from ptool.cli import cli +from ptool.checksums import stats + + +def _make_csv(tmp_path, name, files): + records = ["checksum,fsize,mtime,fpath"] + for fname, content in files.items(): + f = tmp_path / fname + f.write_bytes(content) + r = stats(str(f)) + assert not r.has_error() + records.append(r.value) + csv_path = tmp_path / f"{name}.csv" + csv_path.write_text("\n".join(records) + "\n") + return str(csv_path) + + +@pytest.fixture +def runner(): + return CliRunner() + + +class TestChecksumsCommand: + def test_runs_on_directory(self, runner, tmp_path): + (tmp_path / "a.nc").write_bytes(b"\x00" * 512) + result = runner.invoke(cli, ["checksums", str(tmp_path)]) + assert result.exit_code == 0 + assert "imohash:" in result.output + + def test_runs_on_single_file(self, runner, tmp_path): + f = tmp_path / "data.nc" + f.write_bytes(b"\x00" * 512) + result = runner.invoke(cli, ["checksums", str(f)]) + assert result.exit_code == 0 + assert "imohash:" in result.output + + def test_output_has_csv_header(self, runner, tmp_path): + (tmp_path / "a.nc").write_bytes(b"\x00" * 256) + result = runner.invoke(cli, ["checksums", str(tmp_path)]) + assert "checksum,fsize,mtime,fpath" in result.output + + def test_ignore_flag(self, runner, tmp_path): + (tmp_path / "keep.nc").write_bytes(b"\x00" * 256) + (tmp_path / "skip.tmp").write_bytes(b"\x00" * 256) + result = runner.invoke(cli, ["checksums", "--ignore", "*.tmp", str(tmp_path)]) + assert result.exit_code == 0 + assert "skip.tmp" not in result.output + assert "keep.nc" in result.output + + def test_drop_hidden_files_default(self, runner, tmp_path): + (tmp_path / "visible.nc").write_bytes(b"\x00" * 256) + (tmp_path / ".hidden").write_bytes(b"\x00" * 256) + result = runner.invoke(cli, ["checksums", str(tmp_path)]) + assert result.exit_code == 0 + assert ".hidden" not in result.output + + def test_no_drop_hidden_files(self, runner, tmp_path): + (tmp_path / "visible.nc").write_bytes(b"\x00" * 256) + (tmp_path / ".hidden").write_bytes(b"\x00" * 256) + result = runner.invoke(cli, ["checksums", "--no-drop-hidden-files", str(tmp_path)]) + assert result.exit_code == 0 + assert ".hidden" in result.output + + def test_output_to_file(self, runner, tmp_path): + (tmp_path / "a.nc").write_bytes(b"\x00" * 256) + out = tmp_path / "out.csv" + result = runner.invoke(cli, ["checksums", "-o", str(out), str(tmp_path)]) + assert result.exit_code == 0 + assert out.exists() + assert "imohash:" in out.read_text() + + +class TestCompareCommand: + def test_compare_identical_pools(self, runner, tmp_path): + left_dir = tmp_path / "left" + right_dir = tmp_path / "right" + left_dir.mkdir() + right_dir.mkdir() + content = {"data.nc": b"\xAB" * 1024} + left_csv = _make_csv(left_dir, "left", content) + right_csv = _make_csv(right_dir, "right", content) + result = runner.invoke(cli, ["compare", left_csv, right_csv]) + assert result.exit_code == 0 + + def test_compare_shows_unique(self, runner, tmp_path): + left_dir = tmp_path / "left" + right_dir = tmp_path / "right" + left_dir.mkdir() + right_dir.mkdir() + left_csv = _make_csv(left_dir, "left", {"common.nc": b"\xAA" * 512, "extra.nc": b"\xBB" * 512}) + right_csv = _make_csv(right_dir, "right", {"common.nc": b"\xAA" * 512}) + result = runner.invoke(cli, ["compare", left_csv, right_csv]) + assert result.exit_code == 0 + + +class TestCliHelp: + def test_root_help(self, runner): + result = runner.invoke(cli, ["--help"]) + assert result.exit_code == 0 + assert "checksums" in result.output + assert "compare" in result.output + assert "summary" in result.output + + def test_checksums_help(self, runner): + result = runner.invoke(cli, ["checksums", "--help"]) + assert result.exit_code == 0 + assert "--ignore" in result.output + assert "--outfile" in result.output + + def test_compare_help(self, runner): + result = runner.invoke(cli, ["compare", "--help"]) + assert result.exit_code == 0 + + def test_summary_help(self, runner): + result = runner.invoke(cli, ["summary", "--help"]) + assert result.exit_code == 0 + + def test_prepare_rsync_help(self, runner): + result = runner.invoke(cli, ["prepare-rsync", "--help"]) + assert result.exit_code == 0 From c6f5d0fe9237de6ff9157e621e0beaad743b79f7 Mon Sep 17 00:00:00 2001 From: PavanSiligam Date: Wed, 13 May 2026 02:30:21 +0200 Subject: [PATCH 3/7] Update README with accurate and complete documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add CI, license, and Python version badges - Add table of contents and command summary table - Fix filename: environment.yml → environment.yaml - Fix typos: comapre, minumin, invloved, Denpending - Add dev installation instructions (pip install -e ".[dev]") - Add note on checksum algorithm trade-offs (imohash sampling behaviour) - Add note that both CSVs must use the same algorithm before comparing - Clarify prepare-rsync examples (works/fails sections) - Add Contributing and License sections with links - Update license statement: MIT → GPL v3 Co-Authored-By: Claude Sonnet 4.6 --- README.md | 296 ++++++++++++++++++++++++++---------------------------- 1 file changed, 140 insertions(+), 156 deletions(-) diff --git a/README.md b/README.md index 4d86308..529511b 100644 --- a/README.md +++ b/README.md @@ -1,49 +1,68 @@ -# Ptool +# ptool -A tool to manage pools across different sites. With Ptool it is possible to take -snap-shot of a pool at regular intervals and compare them to monitor the changes -to the pool overtime. These snap-shots can be from the same machine or from -different machines. It also provides tools to sync the state between pool on -different machines. +[![CI](https://github.com/esm-tools/pool_tool/actions/workflows/ci.yml/badge.svg)](https://github.com/esm-tools/pool_tool/actions/workflows/ci.yml) +[![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](LICENSE) +[![Python](https://img.shields.io/badge/python-3.9%2B-blue)](https://www.python.org) + +A tool to manage data pools across different HPC sites. With ptool you can take +snapshots of a pool at regular intervals and compare them to monitor changes over +time. Snapshots can be taken on the same machine or on different machines, and +ptool provides tools to synchronise the state between pools on different machines. + +## Contents + +- [Installation](#installation) +- [Usage](#usage) + - [checksums](#checksums-snapshot-of-a-pool) + - [summary](#summary) + - [compare](#compare) + - [prepare-rsync](#prepare-rsync) +- [Checksum algorithms](#checksum-algorithms) +- [Contributing](#contributing) ## Installation -The current approach is to clone the repository on the machine where `Ptool` needs -to installed +Clone the repository on the machine where ptool needs to be installed: -``` shell +```shell git clone https://github.com/esm-tools/pool_tool.git cd pool_tool ``` -Create a virtual environment, either using `conda` or `pyvenv` to install the package. - -### Using Conda +Create a conda environment and install the package: -``` shell -conda env create -f environment.yml +```shell +conda env create -f environment.yaml conda activate ptool -pip install . +pip install -e . +``` + +For development (includes test dependencies): + +```shell +pip install -e ".[dev]" +pytest tests/ -v ``` -### Usage +## Usage -`Ptool` provides 4 commands to manage the pool. - - `checksums` to create snap-shot of the pool - - `summary` to get an overview by comparing 2 snap-shots - - `compare` to write concrete results of comparing 2 snap-shots - - `prepare-rsync` produces script to transfer files from one machine to another - -#### `checksums` (snap-shot of pool) +ptool provides four commands to manage pools: -Getting the help text to see the all the options +| Command | Purpose | +|---------|---------| +| `checksums` | Create a snapshot of a pool as a CSV | +| `summary` | High-level overview of two snapshots | +| `compare` | Per-file comparison of two snapshots | +| `prepare-rsync` | Generate a script to transfer files between sites | -``` shell +### checksums (snapshot of a pool) + +```shell $ ptool checksums --help Usage: ptool checksums [OPTIONS] PATH - Calculates imohash checksum of file(s) at the given path. Results are - presented as csv. + Calculates checksum of file(s) at the given path. + Results are presented as csv. `--ignore` and `--ignore-dirs` support *wildcards* in filtering down the matches. If no *wildcards* are provided, then it performs a literal match. @@ -51,18 +70,16 @@ Usage: ptool checksums [OPTIONS] PATH Options: --drop-hidden-files / --no-drop-hidden-files - ignore hidden files [default: drop-hidden- - files] + ignore hidden files [default: drop-hidden-files] --ignore TEXT ignore files --ignore-dirs TEXT ignore directories -o, --outfile FILENAME output filename --help Show this message and exit. ``` -Lets say, `Ptool` is installed on Levante and the pool to take snap-shot is -`fesom2` project, then invoke `checksum` as follows: +Taking a snapshot of the `fesom2` pool on Levante: -``` shell +```shell $ ptool checksums --ignore-dirs dist_* -o levante_fesom2.csv /pool/data/AWICM/FESOM2 Gathering files... skipping.. /pool/data/AWICM/FESOM2/FORCING/ERA5 -> /mnt/lustre01/work/ba1138/a270099/era5/forcing/inverted @@ -74,41 +91,31 @@ calculating hashes Elapsed 3.21s Writing results to levante_fesom2.csv ``` +The output CSV has four columns: `checksum`, `fsize`, `mtime`, `fpath`. + +> **Note:** The filename you choose for the CSV is used as the site label in +> analysis output, so pick a meaningful name (e.g. `levante_fesom2.csv`). + #### Remote checksums -It is also possible to get the `checksums` of pool on the remote site. Lets say -we are on Albedo machine and want to compare snap-shot for the project `fesom2` -from both Albedo and Levante then we can also directly invoke `checksums` -command on Levante from Albedo using ssh command as follows: +You can invoke `checksums` on a remote machine via SSH and pipe the output +to a local file. This is useful when you want to collect snapshots on the +machine where analysis will be run: -``` shell -$ ssh a270243@levante.dkrz.de "~/miniforge3/envs/ptool/bin/ptool checksums /pool/data/AWICM/FESOM2 --ignore-dirs dist_*" > levante_fesom2.csv -Gathering files... -skipping.. /pool/data/AWICM/FESOM2/FORCING/ERA5 -> /mnt/lustre01/work/ba1138/a270099/era5/forcing/inverted -getting files Elapsed 0.22s -nfiles: 3060 -Calculating hashes... -100%|██████████| 3060/3060 [00:01<00:00, 1610.56files/s] -calculating hashes Elapsed 3.21s -Writing results to +```shell +$ ssh a270243@levante.dkrz.de \ + "~/miniforge3/envs/ptool/bin/ptool checksums /pool/data/AWICM/FESOM2 --ignore-dirs dist_*" \ + > levante_fesom2.csv ``` -In the above command, since there is no `-o/--outfile` option provided, the -results are written to `` which is piped to a local file on Albedo. It -is also certainly possible to write the snap-shot results to a file on Levante -and then copy it over to Albedo. It is up-to the user, how they want to trigger -the computation but the main point here is the user is supposed to gather the -snap-shots on to the machine where further analysis is carried out. - -NOTE: -User is free to choose any meaningful name for the csv file as they see fit. The -same name is used in displaying the results in the analysis part. +When no `-o/--outfile` is given, results are written to stdout and can be +redirected to a local file. -#### Summary +### summary -Lets say we have computed the snap-shot for the project pool `fesom2` on both Levante and Albedo, then invoke `summary` as follows to get a quick overview of the states these pool are in +Get a high-level overview of two snapshots: -``` shell +```shell $ ptool summary --compact levante_fesom2.csv albedo_fesom2.csv Table 1: Summary with respect to LEVANTE_FESOM2 @@ -152,145 +159,122 @@ MESHES/CORE2 - 9 - 9 ---------------------------------------------------------------------- ``` -There are more options available for `summary` command to alter the results. Use -the `ptool summary --help` to see and investigate other options. +Run `ptool summary --help` to see all available options. -#### comapre +### compare -To get the specifics of the per-files associations, use the compare command as -follows +Get per-file comparison results: -``` shell -$ ptool compare -o lev_alb_fesom2_cmp.csv levante_fesom2.csv albedo_fesom2.csv +```shell +$ ptool compare -o lev_alb_fesom2_cmp.csv levante_fesom2.csv albedo_fesom2.csv Writing results as csv to file lev_alb_fesom2_cmp.csv rpath_left rpath_right -flag +flag identical FORCING/CORE2/ncar_precip.1948.nc /forcing/CORE2/ncar_precip.1948.nc identical FORCING/CORE2/ncar_precip.1952.nc /forcing/CORE2/ncar_precip.1952.nc -identical FORCING/CORE2/ncar_precip.1955.nc /forcing/CORE2/ncar_precip.1955.nc -identical FORCING/CORE2/ncar_precip.1953.nc /forcing/CORE2/ncar_precip.1953.nc -identical FORCING/CORE2/ncar_precip.1956.nc /forcing/CORE2/ncar_precip.1956.nc -... ... ... +... unique FORCING/era5/forcing/inverted/t2m.1972.nc NaN -unique FORCING/era5/forcing/inverted/t2m.1940.nc NaN -unique FORCING/era5/forcing/inverted/t2m.1948.nc NaN -unique FORCING/era5/forcing/inverted/t2m.1960.nc NaN -unique FORCING/era5/forcing/inverted/t2m.1956.nc NaN [3014 rows x 2 columns] ``` -The comparison results are written to `lev_alb_fesom2_cmp.csv` file. Explore the -contents this file using your favorite editor or search for specific entries -using the `grep` command. For instance, using `grep` to finding the occurrences -of folder `MESHES_FESOM2.1/hr` +Each row is classified as one of: `identical`, `renamed`, `modified_latest_left`, +`modified_latest_right`, or `unique`. + +You can search the output with `grep`: -``` shell +```shell $ grep MESHES_FESOM2.1/hr lev_alb_fesom2_cmp.csv identical,MESHES_FESOM2.1/hr/edgenum.out,/HR/edgenum.out -identical,MESHES_FESOM2.1/hr/elvls.out,/HR/elvls.out -identical,MESHES_FESOM2.1/hr/hr_griddes_elements.nc,/HR/hr_griddes_elements.nc -identical,MESHES_FESOM2.1/hr/hr_zaxis.txt,/HR/hr_zaxis.txt -identical,MESHES_FESOM2.1/hr/nlvls.out,/HR/nlvls.out -identical,MESHES_FESOM2.1/hr/elem2d.out,/HR/elem2d.out -identical,MESHES_FESOM2.1/hr/hr_griddes_elements_IFS.nc,/HR/hr_griddes_elements_IFS.nc -identical,MESHES_FESOM2.1/hr/nod2d.out,/HR/nod2d.out -identical,MESHES_FESOM2.1/hr/fesom.mesh.diag.nc,/HR/fesom.mesh.diag.nc -identical,MESHES_FESOM2.1/hr/aux3d.out,/HR/aux3d.out -identical,MESHES_FESOM2.1/hr/hr_griddes_nodes.nc,/HR/hr_griddes_nodes.nc -identical,MESHES_FESOM2.1/hr/hr_griddes_nodes_IFS.nc,/HR/hr_griddes_nodes_IFS.nc -identical,MESHES_FESOM2.1/hr/edge_tri.out,/HR/edge_tri.out -identical,MESHES_FESOM2.1/hr/edges.out,/HR/edges.out +... modified_latest_right,MESHES_FESOM2.1/hr/README.md,/HR/README.md unique,MESHES_FESOM2.1/hr/README, ``` -#### prepare-rsync +> **Important:** Both CSVs must have been generated with the same checksum +> algorithm. ptool will raise an error if you try to compare snapshots taken +> with different algorithms. -This command produces a shell script which contains a list of rsync commands to -be executed. Before running the shell script, it is recommended to check the -contents of this file to see if `prepare-rsync` has produced the desired -result. User can directly manipulate the shell script to adjust for minor -artifacts in-case the options offered by the command does not yield the exact -result the user is expecting. Please check out the `--help` command for details -on the options along with few examples of invoking this command. +### prepare-rsync -``` shell +Generates a shell script containing rsync commands to transfer files between +sites. Always review the script before executing it. + +```shell $ ptool prepare-rsync --help Usage: ptool prepare-rsync [OPTIONS] LEFT RIGHT Prepares rsync commands for the transfer. - Denpending on where data needs to pushed or pulled, provide either - `--lefthost` or `--righthost` information to prefix that path. - - Note: when Albedo system is invloved, run this command on Albedo and provide - the other host information as Albedo can not be reached from other machines. - - Examples that WORK: + Depending on where data needs to be pushed or pulled, provide either + `--lefthost` or `--righthost` to prefix that path. - # commands executed on Albedo (i.e., we are on Albedo) + Note: when Albedo is involved, run this command on Albedo and provide the + other host information, as Albedo cannot be reached from external machines. - 1. sync data: Levante -> Albedo - - ptool prepare-rsync --lefthost user@levante.dkrz.de - checksum_levante_fesom2.csv checksum_albedo_fesom2.csv +Options: + --ignore TEXT ignores directory and files + --flags [unique|modified|both] association type to include + -t, --threshold FLOAT minimum value to satisfy valid association + [default: 0.1] + -l, --lefthost TEXT username@host prefix for the left path + -r, --righthost TEXT username@host prefix for the right path + --help Show this message and exit. +``` - 2. sync data: Albedo -> Levante +Transferring files from Levante to Albedo (run this on Albedo): - ptool prepare-rsync --righthost user@levante.dkrz.de - checksum_albedo_fesom2.csv checksum_levante_fesom2.csv +```shell +$ ptool prepare-rsync --lefthost a270243@levante.dkrz.de levante_fesom2.csv albedo_fesom2.csv +Created sync_cmd.sh +``` - Examples that FAIL: +Verify the contents of `sync_cmd.sh` before executing: - # commands executed on Levante (i.e., we are on Levante) +```shell +$ grep MESHES_FESOM2.1/hr sync_cmd.sh +# MESHES_FESOM2.1/hr +rsync -av --files-from=flist/95087e3b a270243@levante.dkrz.de:/pool/data/AWICM/FESOM2/MESHES_FESOM2.1/hr/ /albedo/pool/FESOM2/HR/ +``` - 1. sync data: Levante -> Albedo +#### Examples that work (run on Albedo) - ptool prepare-rsync --righthost user@albedo0.dmawi.de - checksum_levante_fesom2.csv checksum_albedo_fesom2.csv +```shell +# Sync data: Levante → Albedo +ptool prepare-rsync --lefthost user@levante.dkrz.de levante_fesom2.csv albedo_fesom2.csv - will produce rsync commands as follows: +# Sync data: Albedo → Levante +ptool prepare-rsync --righthost user@levante.dkrz.de albedo_fesom2.csv levante_fesom2.csv +``` - rsync /some/path/on/levante user@albedo0.dmawi.de:/some/path/on/albedo +#### Examples that fail (run on Levante) - Although syntactically correct command, it fails as Albedo is not - reachable from other machines +```shell +# This fails — Albedo is not reachable from external machines +ptool prepare-rsync --righthost user@albedo0.dmawi.de levante_fesom2.csv albedo_fesom2.csv +``` - 2. sync data: Albedo -> Levante +## Checksum algorithms - ptool prepare-rsync --lefthost user@albedo0.dmawi.de - checksum_albedo_fesom2.csv checksum_levante_fesom2.csv +ptool uses [imohash](https://github.com/kalafut/py-imohash) by default. imohash +is a fast sampling hasher: for files larger than 128 KB it reads only three +16 KB windows (start, middle, end) rather than the full file. This makes +snapshot generation very fast even for large pools of climate data. -Options: - --ignore TEXT ignores directory and files - --flags [unique|modified|both] association type to include - -t, --threshold FLOAT minumin value to satisfy valid association - [default: 0.1] - -l, --lefthost TEXT username@host prefix to the path for left - file - -r, --righthost TEXT username@host prefix to the path for right - file - --help Show this message and exit. -``` +**Trade-off:** Two files that are identical in the sampled regions but differ +elsewhere will be incorrectly classified as identical (a false negative). For +most sync workflows this risk is acceptably low, but it is worth being aware of. -Assuming we are on Albedo and want to transfer the files from Levante to Albedo, -invoke the `prepare-rsync` command as follows: +| Algorithm | Reads | False negatives possible? | Best for | +|-----------|-------|--------------------------|----------| +| `imohash` | 3 × 16 KB sample | Yes, for large files | Fast snapshots of large pools | -``` shell -$ ptool prepare-rsync --lefthost a270243@levante.dkrz.de levante_fesom2.csv albedo_fesom2.csv -Created sync_cmd.sh -``` +## Contributing -Verify the contents of `sync_cmd.sh` before executing the script. It is also -possible to directly edit this file to remove selected files from the -transaction. +Contributions are welcome! Please read [CONTRIBUTING.md](CONTRIBUTING.md) for +setup instructions and the contribution workflow. -To give a sneak-peak into the `sync_cmd.sh` file, looking for -`MESHES_FESOM2.1/hr` entry as follows: +## License -``` shell -$ grep MESHES_FESOM2.1/hr sync_cmd.sh -# MESHES_FESOM2.1/hr -rsync -av --files-from=flist/95087e3b a270243@levante.dkrz.de:/pool/data/AWICM/FESOM2/MESHES_FESOM2.1/hr/ /albedo/pool/FESOM2/HR/ -``` +This project is licensed under the GNU General Public License v3.0 — see the +[LICENSE](LICENSE) file for details. From c5666deee240bbc636a70d3dd1b7a9c4f9df3790 Mon Sep 17 00:00:00 2001 From: PavanSiligam Date: Wed, 13 May 2026 03:04:25 +0200 Subject: [PATCH 4/7] Fix structural issues identified in architecture review - Delete ptool/utils.py: dead code that referenced a non-existent ptool_config.yaml and imported yaml (not a declared dependency) - Remove standalone cli() entry point from checksums.py: the registered entry point is ptool.cli:cli; the duplicate was never invoked - Fix prepare_rsync double-open bug: replace open(outfile, "w") (which passes a file object to open()) with outfile.write() on the Click-provided handle; fix hardcoded "sync_cmd.sh" in success message - Clarify sanitise() docstring: document the AWI shared-filesystem rationale for the awi.de special case - Fix DataFrame site/filename attributes: replace monkey-patching with a _SnapshotFrame subclass using pandas _metadata + _constructor so those attributes survive merges, filters, and copies - Fix docstring escape sequences in split(): \, -> \\, (DeprecationWarning) - Replace groupby.apply in merge() with groupby.size() + idxmax(): eliminates FutureWarning about grouping columns being excluded from apply; same max-association-wins algorithm, no apply() needed Co-Authored-By: Claude Sonnet 4.6 --- README.md | 34 ++++++++++++++++++++++++++ ptool/analyse.py | 53 +++++++++++++++++++++-------------------- ptool/checksums.py | 40 +++---------------------------- ptool/cli.py | 17 +++++++------ ptool/utils.py | 59 ---------------------------------------------- 5 files changed, 72 insertions(+), 131 deletions(-) delete mode 100644 ptool/utils.py diff --git a/README.md b/README.md index 529511b..3ada068 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,9 @@ ptool provides tools to synchronise the state between pools on different machine ## Contents +- [How it works](#how-it-works) + - [File association](#file-association) + - [Folder mapping](#folder-mapping) - [Installation](#installation) - [Usage](#usage) - [checksums](#checksums-snapshot-of-a-pool) @@ -20,6 +23,37 @@ ptool provides tools to synchronise the state between pools on different machine - [Checksum algorithms](#checksum-algorithms) - [Contributing](#contributing) +## How it works + +ptool compares pools by taking a **snapshot** (a CSV of checksums + metadata) of each site and +analysing the two CSVs locally. No direct connection between sites is required at analysis time. + +### File association + +Every file pair is classified using two signals: whether the checksum matches and whether the +filename matches. + +| Checksum | Filename | Classification | +|----------|----------|----------------| +| same | same | `identical` — exact copy | +| different | same | `modified` — content changed; mtime indicates which copy is newer | +| same | different | `renamed` — content preserved, name changed | +| different | different | `unique` — file exists only on the left site | + +`compare` and `summary` use these labels to report what needs to be transferred and what is +already in sync. + +### Folder mapping + +Pools on different sites often use different directory layouts. ptool automatically maps +corresponding folders by finding which folder pair has the highest number of matching files +(**max-association wins**). Each folder on the left is paired with at most one folder on the right. + +The `--threshold` option (default `0.1`) controls false-positive filtering: if fewer than 10 % of +files in a left folder are associated with a right folder, the association is treated as noise and +those files are reclassified as `unique`. Raise the threshold to be stricter; lower it to allow +sparse associations. + ## Installation Clone the repository on the machine where ptool needs to be installed: diff --git a/ptool/analyse.py b/ptool/analyse.py index 151e1f2..ea380df 100644 --- a/ptool/analyse.py +++ b/ptool/analyse.py @@ -16,9 +16,19 @@ ] +class _SnapshotFrame(pd.DataFrame): + """DataFrame subclass that carries snapshot metadata through pandas operations.""" + + _metadata = ["site", "filename"] + + @property + def _constructor(self): + return _SnapshotFrame + + def read_csv(filename, ignore=None, drop_duplicates=False): filename = os.path.expanduser(filename) - df = pd.read_csv(filename, engine="pyarrow") + df = _SnapshotFrame(pd.read_csv(filename, engine="pyarrow")) df = df.rename(columns={"fname": "fpath"}) df = df[df.checksum != "-"] df["fname"] = df.fpath.apply(os.path.basename) @@ -32,41 +42,32 @@ def read_csv(filename, ignore=None, drop_duplicates=False): df = df[~df.rparent.str.contains(ignore)] df = df[~df.fname.str.contains(ignore)] df = df.sort_values(by=["checksum", "mtime"]) - dups = df[ - df.duplicated(subset=["checksum", "fname"]).values - # df.duplicated(subset=["checksum"]).values - ] + dups = df[df.duplicated(subset=["checksum", "fname"]).values] if drop_duplicates: df = df.drop_duplicates(subset=["checksum", "fname"]) - # df = df.drop_duplicates(subset=["checksum"]) df["mtime"] = pd.to_datetime(df["mtime"], unit="s") - # _, pool, site = os.path.basename(filename).split("_") - # site, _ = os.path.splitext(site) df.filename = filename - # df.pool = pool - # df.site = site df.site = os.path.splitext(os.path.basename(filename))[0] - dups.filename = filename - # dups.pool = pool - # dups.site = site - dups.site = os.path.splitext(os.path.basename(filename))[0] return df, dups -def _group_with_max_counts(df, key="rparent_right"): - a = [(len(group), group) for gname, group in df.groupby(key)] - count, group = sorted(a, key=lambda x: x[0]).pop() - return group - - def merge(dl, da, on="checksum", how="inner"): m = pd.merge(dl, da, on=on, how=how, suffixes=("_left", "_right")) - mm = m.groupby("rparent_left").apply(_group_with_max_counts).reset_index(drop=True) - mm = ( - mm.groupby("rparent_right") - .apply(lambda x: _group_with_max_counts(x, key="rparent_left")) - .reset_index(drop=True) - ) + # For each left folder, keep only rows belonging to the right folder + # with the most file matches (max-association wins), then repeat from + # the right side to enforce a 1:1 folder pairing. + pair_counts = m.groupby(["rparent_left", "rparent_right"]).size().reset_index(name="_n") + best_right = pair_counts.loc[ + pair_counts.groupby("rparent_left")["_n"].idxmax(), + ["rparent_left", "rparent_right"], + ] + mm = m.merge(best_right, on=["rparent_left", "rparent_right"]) + pair_counts2 = mm.groupby(["rparent_left", "rparent_right"]).size().reset_index(name="_n") + best_left = pair_counts2.loc[ + pair_counts2.groupby("rparent_right")["_n"].idxmax(), + ["rparent_left", "rparent_right"], + ] + mm = mm.merge(best_left, on=["rparent_left", "rparent_right"]) return mm diff --git a/ptool/checksums.py b/ptool/checksums.py index f2d19a4..3d2b823 100755 --- a/ptool/checksums.py +++ b/ptool/checksums.py @@ -62,11 +62,11 @@ def split(s: str, sep: str = ",", escape: str = "\\") -> List[Optional[str]]: >>> split("core1,core2") ["core1", "core2"] - >>> split("core1\,group,core2") + >>> split("core1\\,group,core2") ["core1,group", "core2"] - >>> split("a,b\,c,d") + >>> split("a,b\\,c,d") ['a', 'b,c', 'd'] - >>> split("a|b\|c|d", sep="|") + >>> split("a|b\\|c|d", sep="|") ['a', 'b|c', 'd'] >>> split("a|b#|c|d", sep="|", escape="#") ['a', 'b|c', 'd'] @@ -215,37 +215,3 @@ def main(path, outfile, ignore=None, ignore_dirs=None, drop_hidden_files=True): outfile.writelines(results) -@click.command() -@click.option( - "--drop-hidden-files/--no-drop-hidden-files", - default=True, - is_flag=True, - show_default=True, - help="ignore hidden files", -) -@click.option("--ignore", default=None, show_default=True, help="ignore files") -@click.option( - "--ignore-dirs", default=None, show_default=True, help="ignore directories" -) -@click.option( - "-o", "--outfile", type=click.File("w"), default="-", help="output filename" -) -@click.argument("path") -def cli(path, outfile, ignore, ignore_dirs, drop_hidden_files): - """path to file or folder. - - Calculates imohash checksum of file(s) at the given path. - Results are presented as csv. - """ - path = os.path.expanduser(path) - main( - path=path, - outfile=outfile, - ignore=ignore, - ignore_dirs=ignore_dirs, - drop_hidden_files=drop_hidden_files, - ) - - -if __name__ == "__main__": - cli() diff --git a/ptool/cli.py b/ptool/cli.py index f17ea42..8ec83a7 100644 --- a/ptool/cli.py +++ b/ptool/cli.py @@ -112,7 +112,12 @@ def summary(ignore, drop_duplicates, compact, threshold, left, right): def sanitise(host, path): - "sanitise the hostpart of the path" + """Prefix path with host for rsync. + + AWI machines (*.awi.de) use a shared filesystem visible from both Levante + and Albedo, so no host prefix is needed — the path is used as-is. All + other remote hosts get the standard ``user@host:path`` prefix. + """ if (not host) or ("awi.de" in host): return path return f"{host}:{path}" @@ -260,14 +265,8 @@ def prepare_rsync(outfile, ignore, Flag, threshold, lefthost, righthost, left, r syncs.append("rm -rf flist") # syncs.append(f"rm {names}") syncs = "\n".join(syncs) - # os.makedirs("flist", exist_ok=True) - with open(outfile, "w") as fid: - fid.writelines(syncs) - # for name, fnames in fmap.items(): - # with open(f"flist/{name}", "w") as fid: - # fnames = "\n".join(fnames) - # fid.writelines(fnames) - print("Created sync_cmd.sh") + outfile.write(syncs) + print(f"Created {outfile.name}") @cli.command() diff --git a/ptool/utils.py b/ptool/utils.py deleted file mode 100644 index 444eff1..0000000 --- a/ptool/utils.py +++ /dev/null @@ -1,59 +0,0 @@ -import os -import re -import socket -import yaml - -BASEPATH_SCRIPTS = os.path.dirname(os.path.abspath(__file__)) -CONFIG_PATH_DEFAULT = f"{BASEPATH_SCRIPTS}/ptool_config.yaml" - - -def determine_computer_from_hostname(config_path=CONFIG_PATH_DEFAULT, verbose=True): - """ - Determines which yaml config file is needed for this computer. - Copied and edited from `ESM-Tools` (``esm_tools/src/esm_parser/esm_parser.py``) - - Notes - ----- - The machine must be registered in the ``ptool_config.yaml`` file in - order to be found. - - Input - ----- - config_path : str - Path to the ``ptool_config.yaml`` - - Returns - ------- - str - A string with the name of the machine as described in ``ptool_config.yaml``. If - pattern not matched it returns ``"local"`` - """ - with open(config_path, "r") as f: - all_computers = yaml.load(f, Loader=yaml.SafeLoader) - - for computer_name, info in all_computers.items(): - nodes = info.get("node_names", []) - for computer_pattern in nodes.values(): - if isinstance(computer_pattern, str): - if re.match(computer_pattern, socket.gethostname()) or re.match( - computer_pattern, socket.getfqdn() - ): - return computer_name - - elif isinstance(computer_pattern, (list, tuple)): - computer_patterns = computer_pattern - for pattern in computer_patterns: - if re.match(pattern, socket.gethostname()): - return computer_name - - if verbose: - print( - "Continues assuming that you are running under a ``local`` machine (no " - f"matching pattern in ``{config_path}``)" - ) - - return "local" - - -if __name__ == "__main__": - print(determine_computer_from_hostname()) From d2d99c0caa3c49aea001f07453b064f96f490652 Mon Sep 17 00:00:00 2001 From: PavanSiligam Date: Wed, 13 May 2026 03:31:25 +0200 Subject: [PATCH 5/7] Add TF-IDF weighted folder association and md5 column compatibility Replace raw file-count scoring in merge() with TF-IDF weighting: - Filenames appearing in many left directories (generic sequential names like my_list00001.out, com_info00001.out) receive low weight IDF = log(total_dirs / dirs_containing_fname) - Filenames unique to one directory carry full weight, correctly anchoring the folder pairing to content-specific files rather than structural noise - Also accept 'md5' as a column name alias for 'checksum' in read_csv(), enabling compatibility with older CSV snapshots Add TestTFIDFFolderMapping with two tests: - test_tfidf_prefers_specific_over_generic: constructs a case where raw count maps dir_a to the wrong directory (3 generic matches > 2 specific matches); TF-IDF correctly maps via the unique anchor file - test_unique_files_still_map_correctly: confirms TF-IDF degrades gracefully to correct behaviour when all filenames are unique Co-Authored-By: Claude Sonnet 4.6 --- ptool/analyse.py | 33 +++++++++++------ tests/test_analyse.py | 86 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 11 deletions(-) diff --git a/ptool/analyse.py b/ptool/analyse.py index ea380df..9aa87f4 100644 --- a/ptool/analyse.py +++ b/ptool/analyse.py @@ -29,7 +29,7 @@ def _constructor(self): def read_csv(filename, ignore=None, drop_duplicates=False): filename = os.path.expanduser(filename) df = _SnapshotFrame(pd.read_csv(filename, engine="pyarrow")) - df = df.rename(columns={"fname": "fpath"}) + df = df.rename(columns={"fname": "fpath", "md5": "checksum"}) df = df[df.checksum != "-"] df["fname"] = df.fpath.apply(os.path.basename) df["prefix"] = os.path.commonpath(list(df.fpath.apply(os.path.dirname))) @@ -53,21 +53,31 @@ def read_csv(filename, ignore=None, drop_duplicates=False): def merge(dl, da, on="checksum", how="inner"): m = pd.merge(dl, da, on=on, how=how, suffixes=("_left", "_right")) - # For each left folder, keep only rows belonging to the right folder - # with the most file matches (max-association wins), then repeat from - # the right side to enforce a 1:1 folder pairing. - pair_counts = m.groupby(["rparent_left", "rparent_right"]).size().reset_index(name="_n") - best_right = pair_counts.loc[ - pair_counts.groupby("rparent_left")["_n"].idxmax(), + # TF-IDF weighted folder association: + # down-weight filenames that appear in many left directories (generic sequential + # names like my_list00001.out) so they don't drive false folder pairings. + # Files unique to one directory carry full weight; files ubiquitous across the + # pool carry near-zero weight. IDF = log(total_dirs / dirs_containing_fname). + fname_col = "fname_left" if "fname_left" in m.columns else "fname" + n_dirs = max(dl["rparent"].nunique(), 1) + n_dirs_per_fname = dl.groupby("fname")["rparent"].nunique() + idf = np.log(n_dirs / n_dirs_per_fname).clip(lower=0.0) + m["_score"] = m[fname_col].map(idf).fillna(0.0) + # For each left folder, pick the right folder with the highest TF-IDF score, + # then repeat from the right side to enforce 1:1 pairing. + pair_scores = m.groupby(["rparent_left", "rparent_right"])["_score"].sum().reset_index() + best_right = pair_scores.loc[ + pair_scores.groupby("rparent_left")["_score"].idxmax(), ["rparent_left", "rparent_right"], ] mm = m.merge(best_right, on=["rparent_left", "rparent_right"]) - pair_counts2 = mm.groupby(["rparent_left", "rparent_right"]).size().reset_index(name="_n") - best_left = pair_counts2.loc[ - pair_counts2.groupby("rparent_right")["_n"].idxmax(), + pair_scores2 = mm.groupby(["rparent_left", "rparent_right"])["_score"].sum().reset_index() + best_left = pair_scores2.loc[ + pair_scores2.groupby("rparent_right")["_score"].idxmax(), ["rparent_left", "rparent_right"], ] mm = mm.merge(best_left, on=["rparent_left", "rparent_right"]) + mm = mm.drop(columns=["_score"]) return mm @@ -181,7 +191,8 @@ def _correct_false_positive(df, threshold=0.1): ] + partial_dfs ) - df = df.replace(r"^\s*$", np.nan, regex=True) + with pd.option_context("future.no_silent_downcasting", True): + df = df.replace(r"^\s*$", np.nan, regex=True) df.index = df.index.set_names("flag") return df diff --git a/tests/test_analyse.py b/tests/test_analyse.py index a549ad5..5e7dd18 100644 --- a/tests/test_analyse.py +++ b/tests/test_analyse.py @@ -157,3 +157,89 @@ def test_directory_map_returns_pairs(self, two_identical_pools): dm = directory_map(m) assert "rparent_left" in dm.columns assert "rparent_right" in dm.columns + + +class TestTFIDFFolderMapping: + """TF-IDF weighted association should prefer specific (rare) filenames over + generic (ubiquitous) ones when mapping directories across sites. + + Scenario mirroring the real-world fArc_sorted false mapping: + Left side has three directories that all share three generic files + (IDF ≈ 0). dir_a also has one specific file (IDF = log 3 ≈ 1.10). + Right side has: + dir_x — specific_a + 1 generic (2 raw matches with dir_a) + dir_y — 3 generics (3 raw matches with dir_a — wins on raw count) + Raw count would map dir_a → dir_y (wrong). + TF-IDF maps dir_a → dir_x (correct). + """ + + def _build_pool(self, base, subdirs, files_per_subdir): + """Create directory tree and return {relative_path: content} dict.""" + for d in subdirs: + (base / d).mkdir(parents=True, exist_ok=True) + result = {} + for subdir, files in files_per_subdir.items(): + for fname, content in files.items(): + result[f"{subdir}/{fname}"] = content + return result + + def test_tfidf_prefers_specific_over_generic(self, tmp_path): + generic = b"\xAA" * 512 + generic2 = b"\xBB" * 512 + generic3 = b"\xCC" * 512 + specific = b"\xDD" * 512 + + left_base = tmp_path / "left" + for d in ["dir_a", "dir_b", "dir_c"]: + (left_base / d).mkdir(parents=True) + + # dir_a: 3 generic files (appear in all 3 left dirs) + 1 specific file + # dir_b, dir_c: only the 3 generic files → IDF of generics = log(3/3) = 0 + left_files = self._build_pool(left_base, [], { + "dir_a": {"generic.nc": generic, "generic2.nc": generic2, + "generic3.nc": generic3, "specific_a.nc": specific}, + "dir_b": {"generic.nc": generic, "generic2.nc": generic2, "generic3.nc": generic3}, + "dir_c": {"generic.nc": generic, "generic2.nc": generic2, "generic3.nc": generic3}, + }) + left_csv = _make_csv(left_base, "left", left_files) + + right_base = tmp_path / "right" + for d in ["dir_x", "dir_y"]: + (right_base / d).mkdir(parents=True) + + # dir_x: specific_a + 1 generic → 2 raw matches with dir_a + # dir_y: all 3 generics → 3 raw matches with dir_a (raw count picks this — wrong) + right_files = self._build_pool(right_base, [], { + "dir_x": {"specific_a.nc": specific, "generic.nc": generic}, + "dir_y": {"generic.nc": generic, "generic2.nc": generic2, "generic3.nc": generic3}, + }) + right_csv = _make_csv(right_base, "right", right_files) + + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + m = merge(left, right) + dm = directory_map(m) + + # dir_a must map to dir_x (driven by specific_a), not dir_y (more generic matches) + dir_a_row = dm[dm.rparent_left.str.endswith("dir_a")] + assert len(dir_a_row) == 1, "dir_a should have exactly one mapping" + assert dir_a_row.rparent_right.iloc[0].endswith("dir_x"), ( + f"dir_a mapped to {dir_a_row.rparent_right.iloc[0]!r} — " + "expected dir_x (specific file); raw-count would pick dir_y" + ) + + def test_unique_files_still_map_correctly(self, tmp_path): + """When all filenames are unique (no generics), TF-IDF behaves like raw count.""" + left_base = tmp_path / "left" + right_base = tmp_path / "right" + (left_base / "pool").mkdir(parents=True) + (right_base / "pool").mkdir(parents=True) + + content = b"\xEE" * 512 + left_csv = _make_csv(left_base, "left", {"pool/data.nc": content}) + right_csv = _make_csv(right_base, "right", {"pool/data.nc": content}) + + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + m = merge(left, right) + assert len(m) == 1 From ab5bc24b53d2284185be2bb3846bac4d9ba16199 Mon Sep 17 00:00:00 2001 From: PavanSiligam Date: Wed, 13 May 2026 03:39:43 +0200 Subject: [PATCH 6/7] Add 100%-modified rate detector for spurious folder mappings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a folder pair shares filenames but all matched files differ in content (0 identical, 100% modified), the mapping is likely spurious — e.g. MPI partition files generated for different core counts. Adds _suspicious_pairs() to detect this and marks affected rows with [!] in the summary directory map table. Co-Authored-By: Claude Sonnet 4.6 --- ptool/analyse.py | 31 +++++++++++++++++ tests/test_analyse.py | 77 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 107 insertions(+), 1 deletion(-) diff --git a/ptool/analyse.py b/ptool/analyse.py index 9aa87f4..3404b5c 100644 --- a/ptool/analyse.py +++ b/ptool/analyse.py @@ -85,6 +85,27 @@ def directory_map(m): return (m[["rparent_left", "rparent_right"]]).drop_duplicates() +def _suspicious_pairs(cmp): + """Return (rparent_left, rparent_right) pairs where every matched file is + modified and none are identical. + + When two directories share filename patterns but hold different data (e.g. + MPI partition files for different core counts), all files appear as modified + and zero are identical. That pattern is a reliable indicator that the folder + mapping is spurious rather than a genuine sync mismatch. + """ + c = cmp.reset_index() + paired = c[c["flag"] != "unique"] + bad = set() + for rparent_left, group in paired.groupby("rparent_left"): + flags = set(group["flag"].unique()) + has_modified = bool(flags & {"modified_latest_left", "modified_latest_right"}) + if has_modified and "identical" not in flags: + for rp_right in group["rparent_right"].dropna().unique(): + bad.add((rparent_left, str(rp_right))) + return bad + + def compare(left, right, relabel=False, threshold=0.1): by_hash = merge(left, right) by_name = merge(left, right, on="fname") @@ -335,6 +356,7 @@ def summary( # dmap = directory_map(m) # dmap = dmap[dmap.rparent_left != dmap.rparent_right] dmap = (cmp[["rparent_left", "rparent_right"]]).dropna().drop_duplicates() + suspicious = _suspicious_pairs(cmp) print("-" * 70) if not dmap.empty: dmap.columns = [ @@ -342,8 +364,17 @@ def summary( for c in dmap.columns ] dmap = dmap.reset_index(drop=True) + if suspicious: + left_col = f"rparent_{left_site}" + right_col = f"rparent_{right_site}" + dmap["note"] = dmap.apply( + lambda r: "[!]" if (r[left_col], r[right_col]) in suspicious else "", + axis=1, + ) print(f"\nTable {next(table_no)}: Common directory mapping\n") print(tabulate.tabulate(dmap, headers="keys")) + if suspicious: + print("\n[!] 100% modified, 0 identical — folder mapping is likely spurious.") print("-" * 70) c = cmp.reset_index() diff --git a/tests/test_analyse.py b/tests/test_analyse.py index 5e7dd18..6b3edd6 100644 --- a/tests/test_analyse.py +++ b/tests/test_analyse.py @@ -3,7 +3,7 @@ import pytest -from ptool.analyse import compare, compare_compact, directory_map, merge, read_csv +from ptool.analyse import _suspicious_pairs, compare, compare_compact, directory_map, merge, read_csv from ptool.checksums import stats @@ -243,3 +243,78 @@ def test_unique_files_still_map_correctly(self, tmp_path): right, _ = read_csv(right_csv) m = merge(left, right) assert len(m) == 1 + + +def _make_csv_explicit(tmp_path, name, entries): + """Write a snapshot CSV with hand-crafted (checksum, fsize, mtime, fpath) rows. + + Bypasses the filesystem so mtime can be controlled precisely — useful for + tests that need to distinguish modified_latest_left from modified_latest_right. + """ + records = ["checksum,fsize,mtime,fpath"] + for checksum, fsize, mtime, fpath in entries: + records.append(f"{checksum},{fsize},{mtime},{fpath}") + csv_path = tmp_path / f"{name}.csv" + csv_path.write_text("\n".join(records) + "\n") + return str(csv_path) + + +class TestSuspiciousPairs: + """_suspicious_pairs() should flag folder mappings where every paired file + is modified and none are identical — the signature of a false association.""" + + def test_all_modified_flagged(self, tmp_path): + """Two dirs sharing filenames but different content → suspicious.""" + left_csv = _make_csv_explicit(tmp_path, "left", [ + ("imohash:aaa", 512, 1000.0, "/pool/left/dir_a/file1.nc"), + ("imohash:bbb", 512, 1000.0, "/pool/left/dir_a/file2.nc"), + ("imohash:eee", 512, 1000.0, "/pool/left/dir_b/other.nc"), + ]) + right_csv = _make_csv_explicit(tmp_path, "right", [ + ("imohash:ccc", 512, 2000.0, "/pool/right/dir_x/file1.nc"), + ("imohash:ddd", 512, 2000.0, "/pool/right/dir_x/file2.nc"), + ("imohash:eee", 512, 1000.0, "/pool/right/dir_y/other.nc"), + ]) + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + cmp = compare(left, right) + + suspicious = _suspicious_pairs(cmp) + assert len(suspicious) == 1 + left_dirs = {pair[0] for pair in suspicious} + assert any(d.endswith("dir_a") for d in left_dirs) + + def test_mix_identical_and_modified_not_flagged(self, tmp_path): + """A pair with at least one identical file should never be flagged.""" + shared_cs = "imohash:shared00" + left_csv = _make_csv_explicit(tmp_path, "left", [ + (shared_cs, 512, 1000.0, "/pool/left/dir_a/shared.nc"), + ("imohash:aaa", 512, 1000.0, "/pool/left/dir_a/changed.nc"), + ]) + right_csv = _make_csv_explicit(tmp_path, "right", [ + (shared_cs, 512, 1000.0, "/pool/right/dir_x/shared.nc"), + ("imohash:bbb", 512, 2000.0, "/pool/right/dir_x/changed.nc"), + ]) + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + cmp = compare(left, right) + + suspicious = _suspicious_pairs(cmp) + assert len(suspicious) == 0, ( + "dir_a/dir_x has an identical file — should not be flagged as suspicious" + ) + + def test_unique_only_dir_not_flagged(self, tmp_path): + """A left directory with no right counterpart (all unique) should not be flagged.""" + left_csv = _make_csv_explicit(tmp_path, "left", [ + ("imohash:aaa", 512, 1000.0, "/pool/left/dir_a/only_left.nc"), + ]) + right_csv = _make_csv_explicit(tmp_path, "right", [ + ("imohash:bbb", 512, 1000.0, "/pool/right/dir_x/only_right.nc"), + ]) + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + cmp = compare(left, right) + + suspicious = _suspicious_pairs(cmp) + assert len(suspicious) == 0 From b328b11285fd4f98e09dc9045ca84f0ecf3137d4 Mon Sep 17 00:00:00 2001 From: PavanSiligam Date: Wed, 13 May 2026 03:45:22 +0200 Subject: [PATCH 7/7] Document imohash false-negative risk with deterministic tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit imohash samples only 3×16 KB windows (start, middle, end) for files ≥ 128 KB, so two files that differ only in the unsampled region produce the same digest. Tests demonstrate: (1) the false-negative exists for large files, (2) small files are fully read and immune, (3) different file sizes never collide because size is encoded in the digest. Co-Authored-By: Claude Sonnet 4.6 --- tests/test_imohash_false_negatives.py | 65 +++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 tests/test_imohash_false_negatives.py diff --git a/tests/test_imohash_false_negatives.py b/tests/test_imohash_false_negatives.py new file mode 100644 index 0000000..92fa659 --- /dev/null +++ b/tests/test_imohash_false_negatives.py @@ -0,0 +1,65 @@ +import hashlib + +from imohash import hashfile + +SAMPLE_THRESHOLD = 128 * 1024 +SAMPLE_SIZE = 16 * 1024 + + +def _md5(path) -> str: + return hashlib.md5(path.read_bytes()).hexdigest() + + +class TestImohashFalseNegatives: + + def test_false_negative_large_files(self, tmp_path): + """Two large files identical in sampled regions but different in the + unsampled middle produce the same imohash — the known false-negative risk.""" + size = 512 * 1024 + + base = bytearray(size) + # First window: [0 : SAMPLE_SIZE] → [0 : 16 KB] + # Middle window: [size//2 : size//2+SAMPLE_SIZE] → [256 KB : 272 KB] + # End window: [size-SAMPLE_SIZE : size] → [496 KB : 512 KB] + # Unsampled gap: (16 KB, 256 KB) — safe to modify without touching any window + unsampled_offset = SAMPLE_SIZE + 1024 # 17 KB + + variant = bytearray(base) + variant[unsampled_offset] = 0xFF + + file_a = tmp_path / "file_a.bin" + file_b = tmp_path / "file_b.bin" + file_a.write_bytes(bytes(base)) + file_b.write_bytes(bytes(variant)) + + assert hashfile(str(file_a), hexdigest=True) == hashfile(str(file_b), hexdigest=True), ( + "imohash should produce the same digest for files differing only in the unsampled region" + ) + assert _md5(file_a) != _md5(file_b), "MD5 should correctly detect the difference" + + def test_no_false_negative_small_files(self, tmp_path): + """Files below SAMPLE_THRESHOLD are read fully — no false negatives.""" + size = SAMPLE_THRESHOLD - 1 + + base = bytearray(size) + variant = bytearray(base) + variant[100] = 0xFF + + file_a = tmp_path / "small_a.bin" + file_b = tmp_path / "small_b.bin" + file_a.write_bytes(bytes(base)) + file_b.write_bytes(bytes(variant)) + + assert hashfile(str(file_a), hexdigest=True) != hashfile(str(file_b), hexdigest=True), ( + "Small files are fully hashed — any difference must be detected" + ) + + def test_different_sizes_never_collide(self, tmp_path): + """imohash encodes file size in the digest, so files of different sizes + always produce different hashes regardless of content.""" + file_a = tmp_path / "size_a.bin" + file_b = tmp_path / "size_b.bin" + file_a.write_bytes(b"\x00" * (512 * 1024)) + file_b.write_bytes(b"\x00" * (512 * 1024 + 1)) + + assert hashfile(str(file_a), hexdigest=True) != hashfile(str(file_b), hexdigest=True)