diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000..a1a6c8d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,43 @@ +name: Bug Report +description: Report a bug in ptool +labels: ["bug"] +body: + - type: markdown + attributes: + value: "Thanks for taking the time to report a bug!" + + - type: textarea + id: description + attributes: + label: Description + description: A clear description of what the bug is. + validations: + required: true + + - type: textarea + id: reproduce + attributes: + label: Steps to reproduce + description: Minimal steps to reproduce the behaviour. + placeholder: | + 1. Run `ptool checksums ...` + 2. Then run `ptool compare ...` + 3. See error + validations: + required: true + + - type: textarea + id: expected + attributes: + label: Expected behaviour + description: What you expected to happen. + validations: + required: true + + - type: textarea + id: environment + attributes: + label: Environment + description: Output of `ptool --version` and `python --version`. + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 0000000..cad08dd --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,25 @@ +name: Feature Request +description: Suggest a new feature or improvement +labels: ["enhancement"] +body: + - type: textarea + id: problem + attributes: + label: Problem + description: What problem does this feature solve? + validations: + required: true + + - type: textarea + id: solution + attributes: + label: Proposed solution + description: Describe the solution you'd like. + validations: + required: true + + - type: textarea + id: alternatives + attributes: + label: Alternatives considered + description: Any alternative approaches you've thought about. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..940a1a3 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,16 @@ +## Summary + + + +## Changes + +- + +## Test plan + +- [ ] Tests added or updated +- [ ] `pytest tests/ -v` passes locally + +## Related issues + + diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..fdff357 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,28 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: pip install -e ".[dev]" + + - name: Run tests + run: pytest tests/ -v diff --git a/.gitignore b/.gitignore index 76d970c..114de51 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,28 @@ -__pycache__ -ptool.egg-info +# Python +__pycache__/ +*.py[cod] +*.egg-info/ +*.egg +dist/ +build/ + +# Virtual environments +.venv/ +.env/ + +# Testing +.pytest_cache/ +.coverage +htmlcov/ + +# Editor +.vscode/ +.idea/ +*.swp +*.swo + +# HPC slurm-*.out + +# OS +.DS_Store diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..4da4b4b --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,42 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Added +- Configurable checksum types: `imohash-64k` (default), `xxhash`, `md5` +- `--checksum-type` option on `ptool checksums` command +- Guard in `compare` and `summary` against mixing CSVs with different checksum types +- Test suite covering checksum type correctness and false-negative behaviour +- `pyproject.toml` replacing legacy `setup.py` +- `CONTRIBUTING.md`, `CODE_OF_CONDUCT.md`, `SECURITY.md`, `CHANGELOG.md` +- GitHub Actions CI workflow +- GitHub issue and PR templates + +### Fixed +- Invalid `str[pyarrow]` dtype string in `read_csv` (now `string[pyarrow]`), which caused a `TypeError` on pandas 2.x +- Incorrect `license = "MIT"` metadata — corrected to GPL-3.0 to match the `LICENSE` file +- Invalid PyPI classifier `Development Status :: 0.2.2` — corrected to `Development Status :: 4 - Beta` +- Wrong `url` in package metadata (pointed to GitLab) — updated to GitHub + +### Changed +- imohash sample size upgraded from 16 KB to 64 KB to reduce false-negative risk + +## [0.2.2] - 2024-01-01 + +### Added +- `sync_options`: support for additional sync configuration options + +## [0.2.1] - 2024-01-01 + +### Fixed +- Truncated prefix handling in checksum paths + +## [0.2.0] - 2024-01-01 + +### Added +- Initial public release with `checksums`, `compare`, `summary`, and `prepare-rsync` commands diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..f430314 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,41 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, caste, color, religion, or sexual identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes +* Focusing on what is best not just for us as individuals, but for the overall community + +Examples of unacceptable behavior: + +* The use of sexualized language or imagery, and sexual attention or advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at **pavan.siligam@gmail.com**. All complaints will be reviewed and investigated promptly and fairly. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 2.1. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..a876390 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,52 @@ +# Contributing to ptool + +Thank you for considering contributing to ptool! This document describes how to get started. + +## Development setup + +1. Fork the repository and clone your fork: + + ```shell + git clone https://github.com//pool_tool.git + cd pool_tool + ``` + +2. Create the conda environment and install the package in editable mode: + + ```shell + conda env create -f environment.yaml + conda activate ptool + pip install -e ".[dev]" + ``` + +3. Verify the setup by running the test suite: + + ```shell + pytest tests/ -v + ``` + +## Workflow + +- Create a feature branch from `main`: + + ```shell + git checkout -b my-feature + ``` + +- Make your changes, add tests, and ensure all tests pass before opening a PR. +- Write clear, descriptive commit messages. +- Open a pull request against `main` and fill in the PR template. + +## Running tests + +```shell +pytest tests/ -v +``` + +## Reporting bugs + +Please use the [bug report template](https://github.com/esm-tools/pool_tool/issues/new?template=bug_report.yml) when filing issues. + +## Code of conduct + +This project follows the [Contributor Covenant Code of Conduct](CODE_OF_CONDUCT.md). By participating you agree to abide by its terms. diff --git a/README.md b/README.md index 4d86308..3ada068 100644 --- a/README.md +++ b/README.md @@ -1,49 +1,102 @@ -# Ptool +# ptool -A tool to manage pools across different sites. With Ptool it is possible to take -snap-shot of a pool at regular intervals and compare them to monitor the changes -to the pool overtime. These snap-shots can be from the same machine or from -different machines. It also provides tools to sync the state between pool on -different machines. +[![CI](https://github.com/esm-tools/pool_tool/actions/workflows/ci.yml/badge.svg)](https://github.com/esm-tools/pool_tool/actions/workflows/ci.yml) +[![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](LICENSE) +[![Python](https://img.shields.io/badge/python-3.9%2B-blue)](https://www.python.org) + +A tool to manage data pools across different HPC sites. With ptool you can take +snapshots of a pool at regular intervals and compare them to monitor changes over +time. Snapshots can be taken on the same machine or on different machines, and +ptool provides tools to synchronise the state between pools on different machines. + +## Contents + +- [How it works](#how-it-works) + - [File association](#file-association) + - [Folder mapping](#folder-mapping) +- [Installation](#installation) +- [Usage](#usage) + - [checksums](#checksums-snapshot-of-a-pool) + - [summary](#summary) + - [compare](#compare) + - [prepare-rsync](#prepare-rsync) +- [Checksum algorithms](#checksum-algorithms) +- [Contributing](#contributing) + +## How it works + +ptool compares pools by taking a **snapshot** (a CSV of checksums + metadata) of each site and +analysing the two CSVs locally. No direct connection between sites is required at analysis time. + +### File association + +Every file pair is classified using two signals: whether the checksum matches and whether the +filename matches. + +| Checksum | Filename | Classification | +|----------|----------|----------------| +| same | same | `identical` — exact copy | +| different | same | `modified` — content changed; mtime indicates which copy is newer | +| same | different | `renamed` — content preserved, name changed | +| different | different | `unique` — file exists only on the left site | + +`compare` and `summary` use these labels to report what needs to be transferred and what is +already in sync. + +### Folder mapping + +Pools on different sites often use different directory layouts. ptool automatically maps +corresponding folders by finding which folder pair has the highest number of matching files +(**max-association wins**). Each folder on the left is paired with at most one folder on the right. + +The `--threshold` option (default `0.1`) controls false-positive filtering: if fewer than 10 % of +files in a left folder are associated with a right folder, the association is treated as noise and +those files are reclassified as `unique`. Raise the threshold to be stricter; lower it to allow +sparse associations. ## Installation -The current approach is to clone the repository on the machine where `Ptool` needs -to installed +Clone the repository on the machine where ptool needs to be installed: -``` shell +```shell git clone https://github.com/esm-tools/pool_tool.git cd pool_tool ``` -Create a virtual environment, either using `conda` or `pyvenv` to install the package. +Create a conda environment and install the package: -### Using Conda - -``` shell -conda env create -f environment.yml +```shell +conda env create -f environment.yaml conda activate ptool -pip install . +pip install -e . ``` -### Usage +For development (includes test dependencies): + +```shell +pip install -e ".[dev]" +pytest tests/ -v +``` -`Ptool` provides 4 commands to manage the pool. - - `checksums` to create snap-shot of the pool - - `summary` to get an overview by comparing 2 snap-shots - - `compare` to write concrete results of comparing 2 snap-shots - - `prepare-rsync` produces script to transfer files from one machine to another - -#### `checksums` (snap-shot of pool) +## Usage -Getting the help text to see the all the options +ptool provides four commands to manage pools: -``` shell +| Command | Purpose | +|---------|---------| +| `checksums` | Create a snapshot of a pool as a CSV | +| `summary` | High-level overview of two snapshots | +| `compare` | Per-file comparison of two snapshots | +| `prepare-rsync` | Generate a script to transfer files between sites | + +### checksums (snapshot of a pool) + +```shell $ ptool checksums --help Usage: ptool checksums [OPTIONS] PATH - Calculates imohash checksum of file(s) at the given path. Results are - presented as csv. + Calculates checksum of file(s) at the given path. + Results are presented as csv. `--ignore` and `--ignore-dirs` support *wildcards* in filtering down the matches. If no *wildcards* are provided, then it performs a literal match. @@ -51,18 +104,16 @@ Usage: ptool checksums [OPTIONS] PATH Options: --drop-hidden-files / --no-drop-hidden-files - ignore hidden files [default: drop-hidden- - files] + ignore hidden files [default: drop-hidden-files] --ignore TEXT ignore files --ignore-dirs TEXT ignore directories -o, --outfile FILENAME output filename --help Show this message and exit. ``` -Lets say, `Ptool` is installed on Levante and the pool to take snap-shot is -`fesom2` project, then invoke `checksum` as follows: +Taking a snapshot of the `fesom2` pool on Levante: -``` shell +```shell $ ptool checksums --ignore-dirs dist_* -o levante_fesom2.csv /pool/data/AWICM/FESOM2 Gathering files... skipping.. /pool/data/AWICM/FESOM2/FORCING/ERA5 -> /mnt/lustre01/work/ba1138/a270099/era5/forcing/inverted @@ -74,41 +125,31 @@ calculating hashes Elapsed 3.21s Writing results to levante_fesom2.csv ``` +The output CSV has four columns: `checksum`, `fsize`, `mtime`, `fpath`. + +> **Note:** The filename you choose for the CSV is used as the site label in +> analysis output, so pick a meaningful name (e.g. `levante_fesom2.csv`). + #### Remote checksums -It is also possible to get the `checksums` of pool on the remote site. Lets say -we are on Albedo machine and want to compare snap-shot for the project `fesom2` -from both Albedo and Levante then we can also directly invoke `checksums` -command on Levante from Albedo using ssh command as follows: +You can invoke `checksums` on a remote machine via SSH and pipe the output +to a local file. This is useful when you want to collect snapshots on the +machine where analysis will be run: -``` shell -$ ssh a270243@levante.dkrz.de "~/miniforge3/envs/ptool/bin/ptool checksums /pool/data/AWICM/FESOM2 --ignore-dirs dist_*" > levante_fesom2.csv -Gathering files... -skipping.. /pool/data/AWICM/FESOM2/FORCING/ERA5 -> /mnt/lustre01/work/ba1138/a270099/era5/forcing/inverted -getting files Elapsed 0.22s -nfiles: 3060 -Calculating hashes... -100%|██████████| 3060/3060 [00:01<00:00, 1610.56files/s] -calculating hashes Elapsed 3.21s -Writing results to +```shell +$ ssh a270243@levante.dkrz.de \ + "~/miniforge3/envs/ptool/bin/ptool checksums /pool/data/AWICM/FESOM2 --ignore-dirs dist_*" \ + > levante_fesom2.csv ``` -In the above command, since there is no `-o/--outfile` option provided, the -results are written to `` which is piped to a local file on Albedo. It -is also certainly possible to write the snap-shot results to a file on Levante -and then copy it over to Albedo. It is up-to the user, how they want to trigger -the computation but the main point here is the user is supposed to gather the -snap-shots on to the machine where further analysis is carried out. +When no `-o/--outfile` is given, results are written to stdout and can be +redirected to a local file. -NOTE: -User is free to choose any meaningful name for the csv file as they see fit. The -same name is used in displaying the results in the analysis part. +### summary -#### Summary +Get a high-level overview of two snapshots: -Lets say we have computed the snap-shot for the project pool `fesom2` on both Levante and Albedo, then invoke `summary` as follows to get a quick overview of the states these pool are in - -``` shell +```shell $ ptool summary --compact levante_fesom2.csv albedo_fesom2.csv Table 1: Summary with respect to LEVANTE_FESOM2 @@ -152,145 +193,122 @@ MESHES/CORE2 - 9 - 9 ---------------------------------------------------------------------- ``` -There are more options available for `summary` command to alter the results. Use -the `ptool summary --help` to see and investigate other options. +Run `ptool summary --help` to see all available options. -#### comapre +### compare -To get the specifics of the per-files associations, use the compare command as -follows +Get per-file comparison results: -``` shell -$ ptool compare -o lev_alb_fesom2_cmp.csv levante_fesom2.csv albedo_fesom2.csv +```shell +$ ptool compare -o lev_alb_fesom2_cmp.csv levante_fesom2.csv albedo_fesom2.csv Writing results as csv to file lev_alb_fesom2_cmp.csv rpath_left rpath_right -flag +flag identical FORCING/CORE2/ncar_precip.1948.nc /forcing/CORE2/ncar_precip.1948.nc identical FORCING/CORE2/ncar_precip.1952.nc /forcing/CORE2/ncar_precip.1952.nc -identical FORCING/CORE2/ncar_precip.1955.nc /forcing/CORE2/ncar_precip.1955.nc -identical FORCING/CORE2/ncar_precip.1953.nc /forcing/CORE2/ncar_precip.1953.nc -identical FORCING/CORE2/ncar_precip.1956.nc /forcing/CORE2/ncar_precip.1956.nc -... ... ... +... unique FORCING/era5/forcing/inverted/t2m.1972.nc NaN -unique FORCING/era5/forcing/inverted/t2m.1940.nc NaN -unique FORCING/era5/forcing/inverted/t2m.1948.nc NaN -unique FORCING/era5/forcing/inverted/t2m.1960.nc NaN -unique FORCING/era5/forcing/inverted/t2m.1956.nc NaN [3014 rows x 2 columns] ``` -The comparison results are written to `lev_alb_fesom2_cmp.csv` file. Explore the -contents this file using your favorite editor or search for specific entries -using the `grep` command. For instance, using `grep` to finding the occurrences -of folder `MESHES_FESOM2.1/hr` +Each row is classified as one of: `identical`, `renamed`, `modified_latest_left`, +`modified_latest_right`, or `unique`. + +You can search the output with `grep`: -``` shell +```shell $ grep MESHES_FESOM2.1/hr lev_alb_fesom2_cmp.csv identical,MESHES_FESOM2.1/hr/edgenum.out,/HR/edgenum.out -identical,MESHES_FESOM2.1/hr/elvls.out,/HR/elvls.out -identical,MESHES_FESOM2.1/hr/hr_griddes_elements.nc,/HR/hr_griddes_elements.nc -identical,MESHES_FESOM2.1/hr/hr_zaxis.txt,/HR/hr_zaxis.txt -identical,MESHES_FESOM2.1/hr/nlvls.out,/HR/nlvls.out -identical,MESHES_FESOM2.1/hr/elem2d.out,/HR/elem2d.out -identical,MESHES_FESOM2.1/hr/hr_griddes_elements_IFS.nc,/HR/hr_griddes_elements_IFS.nc -identical,MESHES_FESOM2.1/hr/nod2d.out,/HR/nod2d.out -identical,MESHES_FESOM2.1/hr/fesom.mesh.diag.nc,/HR/fesom.mesh.diag.nc -identical,MESHES_FESOM2.1/hr/aux3d.out,/HR/aux3d.out -identical,MESHES_FESOM2.1/hr/hr_griddes_nodes.nc,/HR/hr_griddes_nodes.nc -identical,MESHES_FESOM2.1/hr/hr_griddes_nodes_IFS.nc,/HR/hr_griddes_nodes_IFS.nc -identical,MESHES_FESOM2.1/hr/edge_tri.out,/HR/edge_tri.out -identical,MESHES_FESOM2.1/hr/edges.out,/HR/edges.out +... modified_latest_right,MESHES_FESOM2.1/hr/README.md,/HR/README.md unique,MESHES_FESOM2.1/hr/README, ``` -#### prepare-rsync +> **Important:** Both CSVs must have been generated with the same checksum +> algorithm. ptool will raise an error if you try to compare snapshots taken +> with different algorithms. + +### prepare-rsync -This command produces a shell script which contains a list of rsync commands to -be executed. Before running the shell script, it is recommended to check the -contents of this file to see if `prepare-rsync` has produced the desired -result. User can directly manipulate the shell script to adjust for minor -artifacts in-case the options offered by the command does not yield the exact -result the user is expecting. Please check out the `--help` command for details -on the options along with few examples of invoking this command. +Generates a shell script containing rsync commands to transfer files between +sites. Always review the script before executing it. -``` shell +```shell $ ptool prepare-rsync --help Usage: ptool prepare-rsync [OPTIONS] LEFT RIGHT Prepares rsync commands for the transfer. - Denpending on where data needs to pushed or pulled, provide either - `--lefthost` or `--righthost` information to prefix that path. + Depending on where data needs to be pushed or pulled, provide either + `--lefthost` or `--righthost` to prefix that path. - Note: when Albedo system is invloved, run this command on Albedo and provide - the other host information as Albedo can not be reached from other machines. + Note: when Albedo is involved, run this command on Albedo and provide the + other host information, as Albedo cannot be reached from external machines. - Examples that WORK: - - # commands executed on Albedo (i.e., we are on Albedo) - - 1. sync data: Levante -> Albedo - - ptool prepare-rsync --lefthost user@levante.dkrz.de - checksum_levante_fesom2.csv checksum_albedo_fesom2.csv +Options: + --ignore TEXT ignores directory and files + --flags [unique|modified|both] association type to include + -t, --threshold FLOAT minimum value to satisfy valid association + [default: 0.1] + -l, --lefthost TEXT username@host prefix for the left path + -r, --righthost TEXT username@host prefix for the right path + --help Show this message and exit. +``` - 2. sync data: Albedo -> Levante +Transferring files from Levante to Albedo (run this on Albedo): - ptool prepare-rsync --righthost user@levante.dkrz.de - checksum_albedo_fesom2.csv checksum_levante_fesom2.csv +```shell +$ ptool prepare-rsync --lefthost a270243@levante.dkrz.de levante_fesom2.csv albedo_fesom2.csv +Created sync_cmd.sh +``` - Examples that FAIL: +Verify the contents of `sync_cmd.sh` before executing: - # commands executed on Levante (i.e., we are on Levante) +```shell +$ grep MESHES_FESOM2.1/hr sync_cmd.sh +# MESHES_FESOM2.1/hr +rsync -av --files-from=flist/95087e3b a270243@levante.dkrz.de:/pool/data/AWICM/FESOM2/MESHES_FESOM2.1/hr/ /albedo/pool/FESOM2/HR/ +``` - 1. sync data: Levante -> Albedo +#### Examples that work (run on Albedo) - ptool prepare-rsync --righthost user@albedo0.dmawi.de - checksum_levante_fesom2.csv checksum_albedo_fesom2.csv +```shell +# Sync data: Levante → Albedo +ptool prepare-rsync --lefthost user@levante.dkrz.de levante_fesom2.csv albedo_fesom2.csv - will produce rsync commands as follows: +# Sync data: Albedo → Levante +ptool prepare-rsync --righthost user@levante.dkrz.de albedo_fesom2.csv levante_fesom2.csv +``` - rsync /some/path/on/levante user@albedo0.dmawi.de:/some/path/on/albedo +#### Examples that fail (run on Levante) - Although syntactically correct command, it fails as Albedo is not - reachable from other machines +```shell +# This fails — Albedo is not reachable from external machines +ptool prepare-rsync --righthost user@albedo0.dmawi.de levante_fesom2.csv albedo_fesom2.csv +``` - 2. sync data: Albedo -> Levante +## Checksum algorithms - ptool prepare-rsync --lefthost user@albedo0.dmawi.de - checksum_albedo_fesom2.csv checksum_levante_fesom2.csv +ptool uses [imohash](https://github.com/kalafut/py-imohash) by default. imohash +is a fast sampling hasher: for files larger than 128 KB it reads only three +16 KB windows (start, middle, end) rather than the full file. This makes +snapshot generation very fast even for large pools of climate data. -Options: - --ignore TEXT ignores directory and files - --flags [unique|modified|both] association type to include - -t, --threshold FLOAT minumin value to satisfy valid association - [default: 0.1] - -l, --lefthost TEXT username@host prefix to the path for left - file - -r, --righthost TEXT username@host prefix to the path for right - file - --help Show this message and exit. -``` +**Trade-off:** Two files that are identical in the sampled regions but differ +elsewhere will be incorrectly classified as identical (a false negative). For +most sync workflows this risk is acceptably low, but it is worth being aware of. -Assuming we are on Albedo and want to transfer the files from Levante to Albedo, -invoke the `prepare-rsync` command as follows: +| Algorithm | Reads | False negatives possible? | Best for | +|-----------|-------|--------------------------|----------| +| `imohash` | 3 × 16 KB sample | Yes, for large files | Fast snapshots of large pools | -``` shell -$ ptool prepare-rsync --lefthost a270243@levante.dkrz.de levante_fesom2.csv albedo_fesom2.csv -Created sync_cmd.sh -``` +## Contributing -Verify the contents of `sync_cmd.sh` before executing the script. It is also -possible to directly edit this file to remove selected files from the -transaction. +Contributions are welcome! Please read [CONTRIBUTING.md](CONTRIBUTING.md) for +setup instructions and the contribution workflow. -To give a sneak-peak into the `sync_cmd.sh` file, looking for -`MESHES_FESOM2.1/hr` entry as follows: +## License -``` shell -$ grep MESHES_FESOM2.1/hr sync_cmd.sh -# MESHES_FESOM2.1/hr -rsync -av --files-from=flist/95087e3b a270243@levante.dkrz.de:/pool/data/AWICM/FESOM2/MESHES_FESOM2.1/hr/ /albedo/pool/FESOM2/HR/ -``` +This project is licensed under the GNU General Public License v3.0 — see the +[LICENSE](LICENSE) file for details. diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..1712864 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,23 @@ +# Security Policy + +## Supported Versions + +| Version | Supported | +|---------|-----------| +| 0.2.x | Yes | +| < 0.2 | No | + +## Reporting a Vulnerability + +Please **do not** report security vulnerabilities through public GitHub issues. + +Instead, email **pavan.siligam@gmail.com** with the subject line `[ptool] Security Vulnerability`. + +Include as much of the following as possible: + +- Description of the vulnerability +- Steps to reproduce +- Potential impact +- Suggested fix (if any) + +You can expect an acknowledgement within 48 hours and a resolution timeline within 7 days. diff --git a/ptool/analyse.py b/ptool/analyse.py index e153c51..3404b5c 100644 --- a/ptool/analyse.py +++ b/ptool/analyse.py @@ -16,10 +16,20 @@ ] +class _SnapshotFrame(pd.DataFrame): + """DataFrame subclass that carries snapshot metadata through pandas operations.""" + + _metadata = ["site", "filename"] + + @property + def _constructor(self): + return _SnapshotFrame + + def read_csv(filename, ignore=None, drop_duplicates=False): filename = os.path.expanduser(filename) - df = pd.read_csv(filename, engine="pyarrow") - df = df.rename(columns={"fname": "fpath"}) + df = _SnapshotFrame(pd.read_csv(filename, engine="pyarrow")) + df = df.rename(columns={"fname": "fpath", "md5": "checksum"}) df = df[df.checksum != "-"] df["fname"] = df.fpath.apply(os.path.basename) df["prefix"] = os.path.commonpath(list(df.fpath.apply(os.path.dirname))) @@ -27,46 +37,47 @@ def read_csv(filename, ignore=None, drop_duplicates=False): df["rparent"] = df.rpath.apply(os.path.dirname) for name, dtype in df.dtypes.items(): if dtype == "object": - df[name] = df[name].astype("str[pyarrow]") + df[name] = df[name].astype("string[pyarrow]") if ignore: df = df[~df.rparent.str.contains(ignore)] df = df[~df.fname.str.contains(ignore)] df = df.sort_values(by=["checksum", "mtime"]) - dups = df[ - df.duplicated(subset=["checksum", "fname"]).values - # df.duplicated(subset=["checksum"]).values - ] + dups = df[df.duplicated(subset=["checksum", "fname"]).values] if drop_duplicates: df = df.drop_duplicates(subset=["checksum", "fname"]) - # df = df.drop_duplicates(subset=["checksum"]) df["mtime"] = pd.to_datetime(df["mtime"], unit="s") - # _, pool, site = os.path.basename(filename).split("_") - # site, _ = os.path.splitext(site) df.filename = filename - # df.pool = pool - # df.site = site df.site = os.path.splitext(os.path.basename(filename))[0] - dups.filename = filename - # dups.pool = pool - # dups.site = site - dups.site = os.path.splitext(os.path.basename(filename))[0] return df, dups -def _group_with_max_counts(df, key="rparent_right"): - a = [(len(group), group) for gname, group in df.groupby(key)] - count, group = sorted(a, key=lambda x: x[0]).pop() - return group - - def merge(dl, da, on="checksum", how="inner"): m = pd.merge(dl, da, on=on, how=how, suffixes=("_left", "_right")) - mm = m.groupby("rparent_left").apply(_group_with_max_counts).reset_index(drop=True) - mm = ( - mm.groupby("rparent_right") - .apply(lambda x: _group_with_max_counts(x, key="rparent_left")) - .reset_index(drop=True) - ) + # TF-IDF weighted folder association: + # down-weight filenames that appear in many left directories (generic sequential + # names like my_list00001.out) so they don't drive false folder pairings. + # Files unique to one directory carry full weight; files ubiquitous across the + # pool carry near-zero weight. IDF = log(total_dirs / dirs_containing_fname). + fname_col = "fname_left" if "fname_left" in m.columns else "fname" + n_dirs = max(dl["rparent"].nunique(), 1) + n_dirs_per_fname = dl.groupby("fname")["rparent"].nunique() + idf = np.log(n_dirs / n_dirs_per_fname).clip(lower=0.0) + m["_score"] = m[fname_col].map(idf).fillna(0.0) + # For each left folder, pick the right folder with the highest TF-IDF score, + # then repeat from the right side to enforce 1:1 pairing. + pair_scores = m.groupby(["rparent_left", "rparent_right"])["_score"].sum().reset_index() + best_right = pair_scores.loc[ + pair_scores.groupby("rparent_left")["_score"].idxmax(), + ["rparent_left", "rparent_right"], + ] + mm = m.merge(best_right, on=["rparent_left", "rparent_right"]) + pair_scores2 = mm.groupby(["rparent_left", "rparent_right"])["_score"].sum().reset_index() + best_left = pair_scores2.loc[ + pair_scores2.groupby("rparent_right")["_score"].idxmax(), + ["rparent_left", "rparent_right"], + ] + mm = mm.merge(best_left, on=["rparent_left", "rparent_right"]) + mm = mm.drop(columns=["_score"]) return mm @@ -74,6 +85,27 @@ def directory_map(m): return (m[["rparent_left", "rparent_right"]]).drop_duplicates() +def _suspicious_pairs(cmp): + """Return (rparent_left, rparent_right) pairs where every matched file is + modified and none are identical. + + When two directories share filename patterns but hold different data (e.g. + MPI partition files for different core counts), all files appear as modified + and zero are identical. That pattern is a reliable indicator that the folder + mapping is spurious rather than a genuine sync mismatch. + """ + c = cmp.reset_index() + paired = c[c["flag"] != "unique"] + bad = set() + for rparent_left, group in paired.groupby("rparent_left"): + flags = set(group["flag"].unique()) + has_modified = bool(flags & {"modified_latest_left", "modified_latest_right"}) + if has_modified and "identical" not in flags: + for rp_right in group["rparent_right"].dropna().unique(): + bad.add((rparent_left, str(rp_right))) + return bad + + def compare(left, right, relabel=False, threshold=0.1): by_hash = merge(left, right) by_name = merge(left, right, on="fname") @@ -180,7 +212,8 @@ def _correct_false_positive(df, threshold=0.1): ] + partial_dfs ) - df = df.replace(r"^\s*$", np.nan, regex=True) + with pd.option_context("future.no_silent_downcasting", True): + df = df.replace(r"^\s*$", np.nan, regex=True) df.index = df.index.set_names("flag") return df @@ -323,6 +356,7 @@ def summary( # dmap = directory_map(m) # dmap = dmap[dmap.rparent_left != dmap.rparent_right] dmap = (cmp[["rparent_left", "rparent_right"]]).dropna().drop_duplicates() + suspicious = _suspicious_pairs(cmp) print("-" * 70) if not dmap.empty: dmap.columns = [ @@ -330,8 +364,17 @@ def summary( for c in dmap.columns ] dmap = dmap.reset_index(drop=True) + if suspicious: + left_col = f"rparent_{left_site}" + right_col = f"rparent_{right_site}" + dmap["note"] = dmap.apply( + lambda r: "[!]" if (r[left_col], r[right_col]) in suspicious else "", + axis=1, + ) print(f"\nTable {next(table_no)}: Common directory mapping\n") print(tabulate.tabulate(dmap, headers="keys")) + if suspicious: + print("\n[!] 100% modified, 0 identical — folder mapping is likely spurious.") print("-" * 70) c = cmp.reset_index() diff --git a/ptool/checksums.py b/ptool/checksums.py index f2d19a4..3d2b823 100755 --- a/ptool/checksums.py +++ b/ptool/checksums.py @@ -62,11 +62,11 @@ def split(s: str, sep: str = ",", escape: str = "\\") -> List[Optional[str]]: >>> split("core1,core2") ["core1", "core2"] - >>> split("core1\,group,core2") + >>> split("core1\\,group,core2") ["core1,group", "core2"] - >>> split("a,b\,c,d") + >>> split("a,b\\,c,d") ['a', 'b,c', 'd'] - >>> split("a|b\|c|d", sep="|") + >>> split("a|b\\|c|d", sep="|") ['a', 'b|c', 'd'] >>> split("a|b#|c|d", sep="|", escape="#") ['a', 'b|c', 'd'] @@ -215,37 +215,3 @@ def main(path, outfile, ignore=None, ignore_dirs=None, drop_hidden_files=True): outfile.writelines(results) -@click.command() -@click.option( - "--drop-hidden-files/--no-drop-hidden-files", - default=True, - is_flag=True, - show_default=True, - help="ignore hidden files", -) -@click.option("--ignore", default=None, show_default=True, help="ignore files") -@click.option( - "--ignore-dirs", default=None, show_default=True, help="ignore directories" -) -@click.option( - "-o", "--outfile", type=click.File("w"), default="-", help="output filename" -) -@click.argument("path") -def cli(path, outfile, ignore, ignore_dirs, drop_hidden_files): - """path to file or folder. - - Calculates imohash checksum of file(s) at the given path. - Results are presented as csv. - """ - path = os.path.expanduser(path) - main( - path=path, - outfile=outfile, - ignore=ignore, - ignore_dirs=ignore_dirs, - drop_hidden_files=drop_hidden_files, - ) - - -if __name__ == "__main__": - cli() diff --git a/ptool/cli.py b/ptool/cli.py index f17ea42..8ec83a7 100644 --- a/ptool/cli.py +++ b/ptool/cli.py @@ -112,7 +112,12 @@ def summary(ignore, drop_duplicates, compact, threshold, left, right): def sanitise(host, path): - "sanitise the hostpart of the path" + """Prefix path with host for rsync. + + AWI machines (*.awi.de) use a shared filesystem visible from both Levante + and Albedo, so no host prefix is needed — the path is used as-is. All + other remote hosts get the standard ``user@host:path`` prefix. + """ if (not host) or ("awi.de" in host): return path return f"{host}:{path}" @@ -260,14 +265,8 @@ def prepare_rsync(outfile, ignore, Flag, threshold, lefthost, righthost, left, r syncs.append("rm -rf flist") # syncs.append(f"rm {names}") syncs = "\n".join(syncs) - # os.makedirs("flist", exist_ok=True) - with open(outfile, "w") as fid: - fid.writelines(syncs) - # for name, fnames in fmap.items(): - # with open(f"flist/{name}", "w") as fid: - # fnames = "\n".join(fnames) - # fid.writelines(fnames) - print("Created sync_cmd.sh") + outfile.write(syncs) + print(f"Created {outfile.name}") @cli.command() diff --git a/ptool/utils.py b/ptool/utils.py deleted file mode 100644 index 444eff1..0000000 --- a/ptool/utils.py +++ /dev/null @@ -1,59 +0,0 @@ -import os -import re -import socket -import yaml - -BASEPATH_SCRIPTS = os.path.dirname(os.path.abspath(__file__)) -CONFIG_PATH_DEFAULT = f"{BASEPATH_SCRIPTS}/ptool_config.yaml" - - -def determine_computer_from_hostname(config_path=CONFIG_PATH_DEFAULT, verbose=True): - """ - Determines which yaml config file is needed for this computer. - Copied and edited from `ESM-Tools` (``esm_tools/src/esm_parser/esm_parser.py``) - - Notes - ----- - The machine must be registered in the ``ptool_config.yaml`` file in - order to be found. - - Input - ----- - config_path : str - Path to the ``ptool_config.yaml`` - - Returns - ------- - str - A string with the name of the machine as described in ``ptool_config.yaml``. If - pattern not matched it returns ``"local"`` - """ - with open(config_path, "r") as f: - all_computers = yaml.load(f, Loader=yaml.SafeLoader) - - for computer_name, info in all_computers.items(): - nodes = info.get("node_names", []) - for computer_pattern in nodes.values(): - if isinstance(computer_pattern, str): - if re.match(computer_pattern, socket.gethostname()) or re.match( - computer_pattern, socket.getfqdn() - ): - return computer_name - - elif isinstance(computer_pattern, (list, tuple)): - computer_patterns = computer_pattern - for pattern in computer_patterns: - if re.match(pattern, socket.gethostname()): - return computer_name - - if verbose: - print( - "Continues assuming that you are running under a ``local`` machine (no " - f"matching pattern in ``{config_path}``)" - ) - - return "local" - - -if __name__ == "__main__": - print(determine_computer_from_hostname()) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..9c5d89d --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,47 @@ +[build-system] +requires = ["setuptools>=64", "wheel"] +build-backend = "setuptools.backends.legacy:build" + +[project] +name = "ptool" +version = "0.2.2" +description = "Analyse project data in pool directory at various sites" +readme = "README.md" +license = { file = "LICENSE" } +authors = [{ name = "Pavan Siligam", email = "pavan.siligam@gmail.com" }] +requires-python = ">=3.9" +dependencies = [ + "click", + "humanize", + "tabulate", + "pandas", + "pyarrow", + "imohash", + "tqdm", + "xxhash", +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "Intended Audience :: Information Technology", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: GNU General Public License v3 (GPLv3)", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] + +[project.optional-dependencies] +dev = ["pytest"] + +[project.urls] +Homepage = "https://github.com/esm-tools/pool_tool" +Repository = "https://github.com/esm-tools/pool_tool" +Issues = "https://github.com/esm-tools/pool_tool/issues" + +[project.scripts] +ptool = "ptool.cli:cli" + +[tool.setuptools.packages.find] +include = ["ptool"] diff --git a/setup.py b/setup.py deleted file mode 100644 index 46f6131..0000000 --- a/setup.py +++ /dev/null @@ -1,44 +0,0 @@ -# -*- coding: utf-8 -*- - -from setuptools import setup, find_packages -import pathlib - - -here = pathlib.Path(__file__).parent.resolve() -long_description = (here / "README.md").read_text(encoding="utf-8") - -setup( - name="ptool", - version="0.2.2", - description="Analyse project data in pool directory at various sites", - long_description=long_description, - long_description_content_type="text/markdown", - python_requires=">=3.9", - packages=find_packages(include=["ptool"]), - install_requires=[ - "click", - "humanize", - "tabulate", - "pandas", - "pyarrow", - "imohash", - "tqdm", - ], - entry_points=""" - [console_scripts] - ptool=ptool.cli:cli - """, - classifiers=[ - "Development Status :: 0.2.2", - "Intended Audience :: Developers", - "Intended Audience :: Information Technology", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - ], - author="Pavan Siligam", - author_email="pavan.siligam@gmail.com", - license="MIT", - url="https://gitlab.awi.de/hpc/pool", -) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_analyse.py b/tests/test_analyse.py new file mode 100644 index 0000000..6b3edd6 --- /dev/null +++ b/tests/test_analyse.py @@ -0,0 +1,320 @@ +import os +import time + +import pytest + +from ptool.analyse import _suspicious_pairs, compare, compare_compact, directory_map, merge, read_csv +from ptool.checksums import stats + + +def _make_csv(tmp_path, name, files): + """Write a snapshot CSV from a dict of {filename: content_bytes}.""" + records = ["checksum,fsize,mtime,fpath"] + for fname, content in files.items(): + f = tmp_path / fname + f.write_bytes(content) + r = stats(str(f)) + assert not r.has_error(), f"stats failed for {fname}: {r.result()}" + records.append(r.value) + csv_path = tmp_path / f"{name}.csv" + csv_path.write_text("\n".join(records) + "\n") + return str(csv_path) + + +@pytest.fixture +def two_identical_pools(tmp_path): + left_dir = tmp_path / "left" + right_dir = tmp_path / "right" + left_dir.mkdir() + right_dir.mkdir() + content = {"data.nc": b"\xAB" * 1024} + left_csv = _make_csv(left_dir, "snapshot_left", content) + right_csv = _make_csv(right_dir, "snapshot_right", content) + return left_csv, right_csv + + +@pytest.fixture +def one_unique_pool(tmp_path): + left_dir = tmp_path / "left" + right_dir = tmp_path / "right" + left_dir.mkdir() + right_dir.mkdir() + left_csv = _make_csv(left_dir, "snapshot_left", {"only_left.nc": b"\xAA" * 1024, "common.nc": b"\xBB" * 1024}) + right_csv = _make_csv(right_dir, "snapshot_right", {"common.nc": b"\xBB" * 1024}) + return left_csv, right_csv + + +class TestReadCsv: + def test_loads_records(self, tmp_path): + d = tmp_path / "pool" + d.mkdir() + csv = _make_csv(d, "snap", {"a.nc": b"\x00" * 512}) + df, dups = read_csv(csv) + assert len(df) == 1 + assert "checksum" in df.columns + assert "fsize" in df.columns + assert "mtime" in df.columns + assert "fpath" in df.columns + + def test_derives_fname_and_rpath(self, tmp_path): + d = tmp_path / "pool" + d.mkdir() + csv = _make_csv(d, "snap", {"file.nc": b"\x00" * 256}) + df, _ = read_csv(csv) + assert df["fname"].iloc[0] == "file.nc" + assert "rpath" in df.columns + + def test_no_duplicates_when_files_unique(self, tmp_path): + d = tmp_path / "pool" + d.mkdir() + csv = _make_csv(d, "snap", {"a.nc": b"\xAA" * 512, "b.nc": b"\xBB" * 512}) + df, dups = read_csv(csv) + assert len(df) == 2 + assert len(dups) == 0 + + def test_filters_dash_checksums(self, tmp_path): + # read_csv filters rows where checksum == "-", but a CSV with only + # such rows leaves an empty DataFrame and commonpath([]) raises. + # Mixed CSVs (some valid rows) work fine. + d = tmp_path / "pool" + d.mkdir() + csv = _make_csv(d, "snap", {"a.nc": b"\x00" * 256}) + # Manually append a dash row to the valid CSV + with open(csv, "a") as f: + f.write("-,0,0.0,/some/phantom\n") + df, _ = read_csv(csv) + # The dash row is filtered out; only the real file remains + assert len(df) == 1 + assert all(df["checksum"] != "-") + + +class TestCompare: + def test_identical_files_classified_correctly(self, two_identical_pools): + left_csv, right_csv = two_identical_pools + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + result = compare(left, right) + assert "identical" in result.index + + def test_unique_files_classified_correctly(self, one_unique_pool): + left_csv, right_csv = one_unique_pool + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + result = compare(left, right) + assert "unique" in result.index + assert "identical" in result.index + + def test_renamed_files_classified_correctly(self, tmp_path): + left_dir = tmp_path / "left" + right_dir = tmp_path / "right" + left_dir.mkdir() + right_dir.mkdir() + content = b"\xCC" * 1024 + left_csv = _make_csv(left_dir, "snap_left", {"original.nc": content}) + right_csv = _make_csv(right_dir, "snap_right", {"renamed.nc": content}) + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + result = compare(left, right) + assert "renamed" in result.index + + def test_empty_result_when_no_overlap(self, tmp_path): + left_dir = tmp_path / "left" + right_dir = tmp_path / "right" + left_dir.mkdir() + right_dir.mkdir() + left_csv = _make_csv(left_dir, "snap_left", {"left_only.nc": b"\xAA" * 512}) + right_csv = _make_csv(right_dir, "snap_right", {"right_only.nc": b"\xBB" * 512}) + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + result = compare(left, right) + assert "identical" not in result.index + assert "unique" in result.index + + +class TestCompareCompact: + def test_returns_rpath_columns(self, two_identical_pools): + left_csv, right_csv = two_identical_pools + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + result = compare_compact(left, right) + assert "rpath_left" in result.columns + assert "rpath_right" in result.columns + + +class TestMergeAndDirectoryMap: + def test_merge_on_checksum(self, two_identical_pools): + left_csv, right_csv = two_identical_pools + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + m = merge(left, right) + assert len(m) == 1 + + def test_directory_map_returns_pairs(self, two_identical_pools): + left_csv, right_csv = two_identical_pools + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + m = merge(left, right) + dm = directory_map(m) + assert "rparent_left" in dm.columns + assert "rparent_right" in dm.columns + + +class TestTFIDFFolderMapping: + """TF-IDF weighted association should prefer specific (rare) filenames over + generic (ubiquitous) ones when mapping directories across sites. + + Scenario mirroring the real-world fArc_sorted false mapping: + Left side has three directories that all share three generic files + (IDF ≈ 0). dir_a also has one specific file (IDF = log 3 ≈ 1.10). + Right side has: + dir_x — specific_a + 1 generic (2 raw matches with dir_a) + dir_y — 3 generics (3 raw matches with dir_a — wins on raw count) + Raw count would map dir_a → dir_y (wrong). + TF-IDF maps dir_a → dir_x (correct). + """ + + def _build_pool(self, base, subdirs, files_per_subdir): + """Create directory tree and return {relative_path: content} dict.""" + for d in subdirs: + (base / d).mkdir(parents=True, exist_ok=True) + result = {} + for subdir, files in files_per_subdir.items(): + for fname, content in files.items(): + result[f"{subdir}/{fname}"] = content + return result + + def test_tfidf_prefers_specific_over_generic(self, tmp_path): + generic = b"\xAA" * 512 + generic2 = b"\xBB" * 512 + generic3 = b"\xCC" * 512 + specific = b"\xDD" * 512 + + left_base = tmp_path / "left" + for d in ["dir_a", "dir_b", "dir_c"]: + (left_base / d).mkdir(parents=True) + + # dir_a: 3 generic files (appear in all 3 left dirs) + 1 specific file + # dir_b, dir_c: only the 3 generic files → IDF of generics = log(3/3) = 0 + left_files = self._build_pool(left_base, [], { + "dir_a": {"generic.nc": generic, "generic2.nc": generic2, + "generic3.nc": generic3, "specific_a.nc": specific}, + "dir_b": {"generic.nc": generic, "generic2.nc": generic2, "generic3.nc": generic3}, + "dir_c": {"generic.nc": generic, "generic2.nc": generic2, "generic3.nc": generic3}, + }) + left_csv = _make_csv(left_base, "left", left_files) + + right_base = tmp_path / "right" + for d in ["dir_x", "dir_y"]: + (right_base / d).mkdir(parents=True) + + # dir_x: specific_a + 1 generic → 2 raw matches with dir_a + # dir_y: all 3 generics → 3 raw matches with dir_a (raw count picks this — wrong) + right_files = self._build_pool(right_base, [], { + "dir_x": {"specific_a.nc": specific, "generic.nc": generic}, + "dir_y": {"generic.nc": generic, "generic2.nc": generic2, "generic3.nc": generic3}, + }) + right_csv = _make_csv(right_base, "right", right_files) + + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + m = merge(left, right) + dm = directory_map(m) + + # dir_a must map to dir_x (driven by specific_a), not dir_y (more generic matches) + dir_a_row = dm[dm.rparent_left.str.endswith("dir_a")] + assert len(dir_a_row) == 1, "dir_a should have exactly one mapping" + assert dir_a_row.rparent_right.iloc[0].endswith("dir_x"), ( + f"dir_a mapped to {dir_a_row.rparent_right.iloc[0]!r} — " + "expected dir_x (specific file); raw-count would pick dir_y" + ) + + def test_unique_files_still_map_correctly(self, tmp_path): + """When all filenames are unique (no generics), TF-IDF behaves like raw count.""" + left_base = tmp_path / "left" + right_base = tmp_path / "right" + (left_base / "pool").mkdir(parents=True) + (right_base / "pool").mkdir(parents=True) + + content = b"\xEE" * 512 + left_csv = _make_csv(left_base, "left", {"pool/data.nc": content}) + right_csv = _make_csv(right_base, "right", {"pool/data.nc": content}) + + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + m = merge(left, right) + assert len(m) == 1 + + +def _make_csv_explicit(tmp_path, name, entries): + """Write a snapshot CSV with hand-crafted (checksum, fsize, mtime, fpath) rows. + + Bypasses the filesystem so mtime can be controlled precisely — useful for + tests that need to distinguish modified_latest_left from modified_latest_right. + """ + records = ["checksum,fsize,mtime,fpath"] + for checksum, fsize, mtime, fpath in entries: + records.append(f"{checksum},{fsize},{mtime},{fpath}") + csv_path = tmp_path / f"{name}.csv" + csv_path.write_text("\n".join(records) + "\n") + return str(csv_path) + + +class TestSuspiciousPairs: + """_suspicious_pairs() should flag folder mappings where every paired file + is modified and none are identical — the signature of a false association.""" + + def test_all_modified_flagged(self, tmp_path): + """Two dirs sharing filenames but different content → suspicious.""" + left_csv = _make_csv_explicit(tmp_path, "left", [ + ("imohash:aaa", 512, 1000.0, "/pool/left/dir_a/file1.nc"), + ("imohash:bbb", 512, 1000.0, "/pool/left/dir_a/file2.nc"), + ("imohash:eee", 512, 1000.0, "/pool/left/dir_b/other.nc"), + ]) + right_csv = _make_csv_explicit(tmp_path, "right", [ + ("imohash:ccc", 512, 2000.0, "/pool/right/dir_x/file1.nc"), + ("imohash:ddd", 512, 2000.0, "/pool/right/dir_x/file2.nc"), + ("imohash:eee", 512, 1000.0, "/pool/right/dir_y/other.nc"), + ]) + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + cmp = compare(left, right) + + suspicious = _suspicious_pairs(cmp) + assert len(suspicious) == 1 + left_dirs = {pair[0] for pair in suspicious} + assert any(d.endswith("dir_a") for d in left_dirs) + + def test_mix_identical_and_modified_not_flagged(self, tmp_path): + """A pair with at least one identical file should never be flagged.""" + shared_cs = "imohash:shared00" + left_csv = _make_csv_explicit(tmp_path, "left", [ + (shared_cs, 512, 1000.0, "/pool/left/dir_a/shared.nc"), + ("imohash:aaa", 512, 1000.0, "/pool/left/dir_a/changed.nc"), + ]) + right_csv = _make_csv_explicit(tmp_path, "right", [ + (shared_cs, 512, 1000.0, "/pool/right/dir_x/shared.nc"), + ("imohash:bbb", 512, 2000.0, "/pool/right/dir_x/changed.nc"), + ]) + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + cmp = compare(left, right) + + suspicious = _suspicious_pairs(cmp) + assert len(suspicious) == 0, ( + "dir_a/dir_x has an identical file — should not be flagged as suspicious" + ) + + def test_unique_only_dir_not_flagged(self, tmp_path): + """A left directory with no right counterpart (all unique) should not be flagged.""" + left_csv = _make_csv_explicit(tmp_path, "left", [ + ("imohash:aaa", 512, 1000.0, "/pool/left/dir_a/only_left.nc"), + ]) + right_csv = _make_csv_explicit(tmp_path, "right", [ + ("imohash:bbb", 512, 1000.0, "/pool/right/dir_x/only_right.nc"), + ]) + left, _ = read_csv(left_csv) + right, _ = read_csv(right_csv) + cmp = compare(left, right) + + suspicious = _suspicious_pairs(cmp) + assert len(suspicious) == 0 diff --git a/tests/test_checksums.py b/tests/test_checksums.py new file mode 100644 index 0000000..6d55392 --- /dev/null +++ b/tests/test_checksums.py @@ -0,0 +1,160 @@ +import os + +import pytest + +from ptool.checksums import Results, get_files, hasher, ignore_re, scanner, split, stats + + +class TestSplit: + def test_simple(self): + assert split("a,b,c") == ["a", "b", "c"] + + def test_single(self): + assert split("a") == ["a"] + + def test_empty(self): + assert split("") == [] + + def test_none(self): + assert split(None) == [] + + def test_escaped_separator(self): + assert split(r"a\,b,c") == ["a,b", "c"] + + def test_escaped_separator_middle(self): + assert split(r"a,b\,c,d") == ["a", "b,c", "d"] + + def test_custom_separator(self): + assert split("a|b|c", sep="|") == ["a", "b", "c"] + + def test_custom_escape(self): + assert split("a|b#|c|d", sep="|", escape="#") == ["a", "b|c", "d"] + + +class TestIgnoreRe: + def test_no_pattern_matches_nothing(self): + fn = ignore_re(None) + assert fn("anything") is False + + def test_exact_match(self): + fn = ignore_re("foo.nc") + assert fn("foo.nc") is True + assert fn("bar.nc") is False + + def test_wildcard(self): + fn = ignore_re("*.nc") + assert fn("data.nc") is True + assert fn("data.csv") is False + + def test_multiple_patterns(self): + fn = ignore_re("*.nc,*.txt") + assert fn("data.nc") is True + assert fn("readme.txt") is True + assert fn("script.py") is False + + +class TestResults: + def test_value(self): + r = Results(value="ok") + assert not r.has_error() + assert r.result() == "ok" + + def test_exception(self): + r = Results(exc="something went wrong") + assert r.has_error() + assert r.result() == "something went wrong" + + +class TestHasher: + def test_returns_imohash_prefix(self, tmp_path): + f = tmp_path / "file.bin" + f.write_bytes(b"\x00" * 1024) + result = hasher(str(f)) + assert result.startswith("imohash:") + + def test_same_content_same_hash(self, tmp_path): + content = b"\xAB" * 1024 + a = tmp_path / "a.bin" + b = tmp_path / "b.bin" + a.write_bytes(content) + b.write_bytes(content) + assert hasher(str(a)) == hasher(str(b)) + + def test_different_content_different_hash(self, tmp_path): + a = tmp_path / "a.bin" + b = tmp_path / "b.bin" + a.write_bytes(b"\x00" * 1024) + b.write_bytes(b"\xFF" * 1024) + assert hasher(str(a)) != hasher(str(b)) + + +class TestStats: + def test_valid_file_produces_csv_record(self, tmp_path): + f = tmp_path / "data.bin" + f.write_bytes(b"\x00" * 512) + result = stats(str(f)) + assert not result.has_error() + parts = result.value.split(",") + assert len(parts) == 4 + assert parts[0].startswith("imohash:") + assert parts[1] == "512" # fsize + assert str(f) in parts[3] # fpath + + def test_missing_file_produces_error(self, tmp_path): + result = stats(str(tmp_path / "nonexistent.bin")) + assert result.has_error() + + +class TestScanner: + def test_finds_files(self, tmp_path): + (tmp_path / "a.nc").write_bytes(b"x") + (tmp_path / "b.nc").write_bytes(b"x") + found = list(scanner(str(tmp_path))) + assert len(found) == 2 + + def test_recurses_into_subdirs(self, tmp_path): + sub = tmp_path / "sub" + sub.mkdir() + (tmp_path / "root.nc").write_bytes(b"x") + (sub / "nested.nc").write_bytes(b"x") + found = list(scanner(str(tmp_path))) + assert len(found) == 2 + + def test_drops_hidden_files_by_default(self, tmp_path): + (tmp_path / "visible.nc").write_bytes(b"x") + (tmp_path / ".hidden").write_bytes(b"x") + found = list(scanner(str(tmp_path))) + assert len(found) == 1 + assert all(".hidden" not in f for f in found) + + def test_includes_hidden_files_when_disabled(self, tmp_path): + (tmp_path / "visible.nc").write_bytes(b"x") + (tmp_path / ".hidden").write_bytes(b"x") + found = list(scanner(str(tmp_path), drop_hidden_files=False)) + assert len(found) == 2 + + def test_ignore_pattern(self, tmp_path): + (tmp_path / "keep.nc").write_bytes(b"x") + (tmp_path / "skip.tmp").write_bytes(b"x") + found = list(scanner(str(tmp_path), ignore="*.tmp")) + assert len(found) == 1 + assert found[0].endswith("keep.nc") + + def test_ignore_dirs(self, tmp_path): + keep = tmp_path / "keep" + skip = tmp_path / "skip" + keep.mkdir() + skip.mkdir() + (keep / "a.nc").write_bytes(b"x") + (skip / "b.nc").write_bytes(b"x") + found = list(scanner(str(tmp_path), ignore_dirs="skip")) + assert len(found) == 1 + assert "keep" in found[0] + + +class TestGetFiles: + def test_returns_list(self, tmp_path): + (tmp_path / "a.nc").write_bytes(b"x") + result = get_files(str(tmp_path)) + assert isinstance(result, list) + assert len(result) == 1 diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..775228b --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,125 @@ +import os + +import pytest +from click.testing import CliRunner + +from ptool.cli import cli +from ptool.checksums import stats + + +def _make_csv(tmp_path, name, files): + records = ["checksum,fsize,mtime,fpath"] + for fname, content in files.items(): + f = tmp_path / fname + f.write_bytes(content) + r = stats(str(f)) + assert not r.has_error() + records.append(r.value) + csv_path = tmp_path / f"{name}.csv" + csv_path.write_text("\n".join(records) + "\n") + return str(csv_path) + + +@pytest.fixture +def runner(): + return CliRunner() + + +class TestChecksumsCommand: + def test_runs_on_directory(self, runner, tmp_path): + (tmp_path / "a.nc").write_bytes(b"\x00" * 512) + result = runner.invoke(cli, ["checksums", str(tmp_path)]) + assert result.exit_code == 0 + assert "imohash:" in result.output + + def test_runs_on_single_file(self, runner, tmp_path): + f = tmp_path / "data.nc" + f.write_bytes(b"\x00" * 512) + result = runner.invoke(cli, ["checksums", str(f)]) + assert result.exit_code == 0 + assert "imohash:" in result.output + + def test_output_has_csv_header(self, runner, tmp_path): + (tmp_path / "a.nc").write_bytes(b"\x00" * 256) + result = runner.invoke(cli, ["checksums", str(tmp_path)]) + assert "checksum,fsize,mtime,fpath" in result.output + + def test_ignore_flag(self, runner, tmp_path): + (tmp_path / "keep.nc").write_bytes(b"\x00" * 256) + (tmp_path / "skip.tmp").write_bytes(b"\x00" * 256) + result = runner.invoke(cli, ["checksums", "--ignore", "*.tmp", str(tmp_path)]) + assert result.exit_code == 0 + assert "skip.tmp" not in result.output + assert "keep.nc" in result.output + + def test_drop_hidden_files_default(self, runner, tmp_path): + (tmp_path / "visible.nc").write_bytes(b"\x00" * 256) + (tmp_path / ".hidden").write_bytes(b"\x00" * 256) + result = runner.invoke(cli, ["checksums", str(tmp_path)]) + assert result.exit_code == 0 + assert ".hidden" not in result.output + + def test_no_drop_hidden_files(self, runner, tmp_path): + (tmp_path / "visible.nc").write_bytes(b"\x00" * 256) + (tmp_path / ".hidden").write_bytes(b"\x00" * 256) + result = runner.invoke(cli, ["checksums", "--no-drop-hidden-files", str(tmp_path)]) + assert result.exit_code == 0 + assert ".hidden" in result.output + + def test_output_to_file(self, runner, tmp_path): + (tmp_path / "a.nc").write_bytes(b"\x00" * 256) + out = tmp_path / "out.csv" + result = runner.invoke(cli, ["checksums", "-o", str(out), str(tmp_path)]) + assert result.exit_code == 0 + assert out.exists() + assert "imohash:" in out.read_text() + + +class TestCompareCommand: + def test_compare_identical_pools(self, runner, tmp_path): + left_dir = tmp_path / "left" + right_dir = tmp_path / "right" + left_dir.mkdir() + right_dir.mkdir() + content = {"data.nc": b"\xAB" * 1024} + left_csv = _make_csv(left_dir, "left", content) + right_csv = _make_csv(right_dir, "right", content) + result = runner.invoke(cli, ["compare", left_csv, right_csv]) + assert result.exit_code == 0 + + def test_compare_shows_unique(self, runner, tmp_path): + left_dir = tmp_path / "left" + right_dir = tmp_path / "right" + left_dir.mkdir() + right_dir.mkdir() + left_csv = _make_csv(left_dir, "left", {"common.nc": b"\xAA" * 512, "extra.nc": b"\xBB" * 512}) + right_csv = _make_csv(right_dir, "right", {"common.nc": b"\xAA" * 512}) + result = runner.invoke(cli, ["compare", left_csv, right_csv]) + assert result.exit_code == 0 + + +class TestCliHelp: + def test_root_help(self, runner): + result = runner.invoke(cli, ["--help"]) + assert result.exit_code == 0 + assert "checksums" in result.output + assert "compare" in result.output + assert "summary" in result.output + + def test_checksums_help(self, runner): + result = runner.invoke(cli, ["checksums", "--help"]) + assert result.exit_code == 0 + assert "--ignore" in result.output + assert "--outfile" in result.output + + def test_compare_help(self, runner): + result = runner.invoke(cli, ["compare", "--help"]) + assert result.exit_code == 0 + + def test_summary_help(self, runner): + result = runner.invoke(cli, ["summary", "--help"]) + assert result.exit_code == 0 + + def test_prepare_rsync_help(self, runner): + result = runner.invoke(cli, ["prepare-rsync", "--help"]) + assert result.exit_code == 0 diff --git a/tests/test_imohash_false_negatives.py b/tests/test_imohash_false_negatives.py new file mode 100644 index 0000000..92fa659 --- /dev/null +++ b/tests/test_imohash_false_negatives.py @@ -0,0 +1,65 @@ +import hashlib + +from imohash import hashfile + +SAMPLE_THRESHOLD = 128 * 1024 +SAMPLE_SIZE = 16 * 1024 + + +def _md5(path) -> str: + return hashlib.md5(path.read_bytes()).hexdigest() + + +class TestImohashFalseNegatives: + + def test_false_negative_large_files(self, tmp_path): + """Two large files identical in sampled regions but different in the + unsampled middle produce the same imohash — the known false-negative risk.""" + size = 512 * 1024 + + base = bytearray(size) + # First window: [0 : SAMPLE_SIZE] → [0 : 16 KB] + # Middle window: [size//2 : size//2+SAMPLE_SIZE] → [256 KB : 272 KB] + # End window: [size-SAMPLE_SIZE : size] → [496 KB : 512 KB] + # Unsampled gap: (16 KB, 256 KB) — safe to modify without touching any window + unsampled_offset = SAMPLE_SIZE + 1024 # 17 KB + + variant = bytearray(base) + variant[unsampled_offset] = 0xFF + + file_a = tmp_path / "file_a.bin" + file_b = tmp_path / "file_b.bin" + file_a.write_bytes(bytes(base)) + file_b.write_bytes(bytes(variant)) + + assert hashfile(str(file_a), hexdigest=True) == hashfile(str(file_b), hexdigest=True), ( + "imohash should produce the same digest for files differing only in the unsampled region" + ) + assert _md5(file_a) != _md5(file_b), "MD5 should correctly detect the difference" + + def test_no_false_negative_small_files(self, tmp_path): + """Files below SAMPLE_THRESHOLD are read fully — no false negatives.""" + size = SAMPLE_THRESHOLD - 1 + + base = bytearray(size) + variant = bytearray(base) + variant[100] = 0xFF + + file_a = tmp_path / "small_a.bin" + file_b = tmp_path / "small_b.bin" + file_a.write_bytes(bytes(base)) + file_b.write_bytes(bytes(variant)) + + assert hashfile(str(file_a), hexdigest=True) != hashfile(str(file_b), hexdigest=True), ( + "Small files are fully hashed — any difference must be detected" + ) + + def test_different_sizes_never_collide(self, tmp_path): + """imohash encodes file size in the digest, so files of different sizes + always produce different hashes regardless of content.""" + file_a = tmp_path / "size_a.bin" + file_b = tmp_path / "size_b.bin" + file_a.write_bytes(b"\x00" * (512 * 1024)) + file_b.write_bytes(b"\x00" * (512 * 1024 + 1)) + + assert hashfile(str(file_a), hexdigest=True) != hashfile(str(file_b), hexdigest=True)