From 6e0a6422d6aad4755149b8934196652ddb398d95 Mon Sep 17 00:00:00 2001 From: Oliver Sherouse Date: Sun, 30 Nov 2025 10:51:43 -0500 Subject: [PATCH] fix: recover from cache corruption and update tooling Contributes to #78 Signed-off-by: Oliver Sherouse --- AGENTS.md | 38 ++++++++++++++++++++++---------- Makefile | 22 +++++++++++++++++++ mkdocs.yml | 2 +- tests/test_cache.py | 46 +++++++++++++++++++++++++++++++++++++++ wbdata/cache.py | 53 +++++++++++++++++++++++++++++++++++++-------- wbdata/client.py | 17 ++++++++++----- wbdata/version.py | 2 +- 7 files changed, 152 insertions(+), 28 deletions(-) create mode 100644 Makefile create mode 100644 tests/test_cache.py diff --git a/AGENTS.md b/AGENTS.md index 434bb78..3b6525c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,19 +1,33 @@ # Repository Guidelines -## Project Structure & Module Organization -Source lives in `wbdata/` with clients, caching helpers, and API utilities; `wbdata/version.py` centralizes the library version and should be the single source when updating releases. Tests reside in `tests/` with `test_*.py` modules mirroring public APIs. Contributor-facing docs and MkDocs content sit in `docs/`, while packaging metadata and tooling configuration are in `pyproject.toml`. +## Quickstart +- Run `make setup` to install all extras and dev tools. +- Run `make check` to run format (rewrites files in place), lint, ty check, and tests. +- Open pull requests against `master`. +- Do not commit or push unless explicitly instructed. -## Build, Test, and Development Commands -Install dependencies with `uv sync --all-extras --group dev` so the docs, pandas extras, and developer tooling are available. Use `uv run pytest` for the default suite and coverage, matching the `--cov=wbdata` addopts in configuration. Run `uv run ruff check wbdata tests` to lint, and `uv run mypy wbdata` for type validation. During documentation work, serve the site locally via `uv run mkdocs serve`. +## Dev Loop +- Run `make format` (ruff format).
+- Run `make lint` (ruff check wbdata tests). +- Run `make typecheck` (ty check wbdata). +- Run `make test` (pytest with coverage addopts from config). +- Preview docs with `uv run mkdocs serve`. -## Coding Style & Naming Conventions -Follow standard Python formatting with four-space indentation and readable, snake_case symbols. Public APIs exposed in `wbdata/__init__.py` should maintain descriptive, lowercase names; classes stay in CapWords. Ruff enforces PEP 8, import sorting, and selected Bugbear/Simplify rules—run it before committing. Keep modules typed, updating `py.typed` coverage when adding packages, and prefer explicit re-exports in `__all__` blocks where applicable. +## Project Layout +- Keep code in `wbdata/` (client, caching, API helpers); treat `wbdata/version.py` as the single source of truth for the version. +- Add tests in `tests/` with `test_*.py` mirroring public APIs. +- Maintain docs in `docs/` and `mkdocs.yml`; adjust packaging/tooling in `pyproject.toml`. -## Testing Guidelines -Write new tests under `tests/` using `pytest` conventions (`test_feature.py`, functions starting with `test_`). When adding network-heavy scenarios, leverage fixtures to isolate HTTP calls. Keep coverage from `pytest-cov` stable by exercising new branches, and include regression cases that mirror reported issues. +## Style +- Use Python 4-space indent, snake_case; CapWords for classes; re-export via `__all__` when needed. +- Run ruff (PEP8, imports, Bugbear/Simplify) before commits. +- Preserve typing coverage (`py.typed`); prefer explicit types. -## Commit & Pull Request Guidelines -Aim for concise, imperative subjects, optionally prefixed with Conventional Commit types as seen in `git log` (e.g., `fix: improve caching`). Reference related issues or discussions with `(#123)` in the subject when merging via GitHub.
Before opening a PR, ensure lint, typing, and tests pass, document user-facing changes, and provide a short summary plus reproduction or screenshots when behavior shifts. +## Testing +- Add pytest cases under `tests/`; favor fixtures for network isolation. +- Maintain coverage by exercising new branches; include regression cases for reported bugs. -## Documentation & Release Notes -Update `docs/` pages when modifying user workflows, and verify the navigation using the MkDocs preview. Release metadata lives in `wbdata/version.py`; bump it in sync with changelog entries and confirm that packaging files (`pyproject.toml`, `MANIFEST.in`) need no extra updates. +## PR Expectations +- Use conventional, imperative titles (e.g., `fix: improve caching`). +- Ensure format/lint/type/tests pass; document user-facing changes; include repro or screenshots when behavior shifts. +- Bump `wbdata/version.py` alongside changelog/release notes when shipping releases. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..df4357a --- /dev/null +++ b/Makefile @@ -0,0 +1,22 @@ +setup: + uv sync --all-extras --group dev +.PHONY: setup + +format: + uv run ruff format wbdata tests docs +.PHONY: format + +lint: + uv run ruff check wbdata tests +.PHONY: lint + +typecheck: + uv run ty check wbdata +.PHONY: typecheck + +test: + uv run pytest +.PHONY: test + +check: format lint typecheck test +.PHONY: check diff --git a/mkdocs.yml b/mkdocs.yml index 5832bc0..520641a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -8,7 +8,7 @@ plugins: options: show_source: false members_order: source - docstrings_options: + docstring_options: returns_named_value: false returns_multiple_items: false watch: diff --git a/tests/test_cache.py b/tests/test_cache.py new file mode 100644 index 0000000..13ac2ea --- /dev/null +++ b/tests/test_cache.py @@ -0,0 +1,46 @@ +import wbdata.cache as cache + + +def test_get_cache_recovers_from_corruption(tmp_path, monkeypatch): + cache_path = tmp_path / "cachefile" + 
cache_path.write_bytes(b"corrupt") + + call_count = {"value": 0} + + class FakeCache: + def expire(self): + call_count["value"] += 1 + if call_count["value"] == 1: + raise SystemError("boom") + + monkeypatch.setattr( + cache.shelved_cache, "PersistentCache", lambda *args, **kwargs: FakeCache() + ) + + result = cache.get_cache(path=cache_path, ttl_days=1, max_size=1) + + assert isinstance(result, FakeCache) + assert call_count["value"] == 2 # retried after clearing corruption + assert not cache_path.exists() + + +def test_clear_cache_files_removes_shelve_variants(tmp_path): + base = tmp_path / "cachefile" + variants = [ + base, + base.with_suffix(".db"), + base.with_suffix(".dat"), + base.with_suffix(".dir"), + base.with_suffix(".bak"), + tmp_path / "cachefile.extra", + ] + for path in variants: + if path.suffix == ".dir": + path.mkdir() + else: + path.write_text("x") + + cache._clear_cache_files(base) + + for path in variants: + assert not path.exists() diff --git a/wbdata/cache.py b/wbdata/cache.py index 60fc272..cdcc17e 100644 --- a/wbdata/cache.py +++ b/wbdata/cache.py @@ -6,6 +6,8 @@ import datetime as dt import logging import os +import pickle +import shutil from pathlib import Path import appdirs @@ -65,16 +67,49 @@ def get_cache( `WBDATA_CACHE_MAX_SIZE`. 
""" - path = path or CACHE_PATH - Path(path).parent.mkdir(parents=True, exist_ok=True) + path = Path(path or CACHE_PATH) + path.parent.mkdir(parents=True, exist_ok=True) ttl_days = ttl_days or TTL_DAYS max_size = max_size or MAX_SIZE - cache = shelved_cache.PersistentCache( - cachetools.TTLCache, - filename=str(path), - maxsize=max_size, - ttl=dt.timedelta(days=ttl_days), - timer=dt.datetime.now, - ) + + def _build_cache() -> shelved_cache.PersistentCache: + return shelved_cache.PersistentCache( + cachetools.TTLCache, + filename=str(path), + maxsize=max_size, + ttl=dt.timedelta(days=ttl_days), + timer=dt.datetime.now, + ) + + try: + cache = _build_cache() + cache.expire() + return cache + except (SystemError, EOFError, pickle.UnpicklingError, OSError) as exc: + log.warning("Cache at %s failed to load (%s); recreating", path, exc) + _clear_cache_files(path) + + cache = _build_cache() cache.expire() return cache + + +def _clear_cache_files(path: Path) -> None: + """Remove shelve-backed cache files derived from *path*. + + Shelve implementations may create multiple files with suffixes (e.g. `.db`, + `.bak`, `.dat`, `.dir`). Remove the base file and any siblings sharing its + stem to ensure we start from a clean slate after corruption. 
+ """ + + suffixes = (".db", ".bak", ".dat", ".dir") + candidates = {path} + candidates.update(path.parent.glob(f"{path.stem}.*")) + candidates.update({path.with_suffix(suffix) for suffix in suffixes}) + for candidate in candidates: + try: + candidate.unlink() + except FileNotFoundError: + continue + except (IsADirectoryError, PermissionError): + shutil.rmtree(candidate, ignore_errors=True) diff --git a/wbdata/client.py b/wbdata/client.py index f64e38f..471755e 100644 --- a/wbdata/client.py +++ b/wbdata/client.py @@ -8,7 +8,7 @@ import re from collections.abc import Generator, Iterable, Sequence from pathlib import Path -from typing import Any +from typing import Any, cast import decorator import requests @@ -471,11 +471,18 @@ def get_series( parse_dates=parse_dates, skip_cache=skip_cache, ) - df = pd.DataFrame( - [[i["country"]["value"], i["date"], i["value"]] for i in raw_data], - columns=["country", "date", name], + rows = cast( + list[tuple[str, str, float | None]], + [ + ( + str(i["country"]["value"]), + str(i["date"]), + _cast_float(i["value"]), + ) + for i in raw_data + ], ) - df[name] = df[name].map(_cast_float) + df = pd.DataFrame.from_records(rows, columns=["country", "date", name]) if not keep_levels and len(df["country"].unique()) == 1: df = df.set_index("date") elif not keep_levels and len(df["date"].unique()) == 1: diff --git a/wbdata/version.py b/wbdata/version.py index 6849410..a82b376 100644 --- a/wbdata/version.py +++ b/wbdata/version.py @@ -1 +1 @@ -__version__ = "1.1.0" +__version__ = "1.1.1"