From 6e0a6422d6aad4755149b8934196652ddb398d95 Mon Sep 17 00:00:00 2001
From: Oliver Sherouse <oliver@oliversherouse.com>
Date: Sun, 30 Nov 2025 10:51:43 -0500
Subject: [PATCH] fix: recover from cache corruption and update tooling

Contributes to #78

Signed-off-by: Oliver Sherouse <oliver@oliversherouse.com>
---
 AGENTS.md           | 38 ++++++++++++++++++++++----------
 Makefile            | 22 +++++++++++++++++++
 mkdocs.yml          |  2 +-
 tests/test_cache.py | 46 +++++++++++++++++++++++++++++++++++++++
 wbdata/cache.py     | 53 +++++++++++++++++++++++++++++++++++++--------
 wbdata/client.py    | 17 ++++++++++-----
 wbdata/version.py   |  2 +-
 7 files changed, 152 insertions(+), 28 deletions(-)
 create mode 100644 Makefile
 create mode 100644 tests/test_cache.py

diff --git a/AGENTS.md b/AGENTS.md
index 434bb78..3b6525c 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -1,19 +1,33 @@
 # Repository Guidelines
 
-## Project Structure & Module Organization
-Source lives in `wbdata/` with clients, caching helpers, and API utilities; `wbdata/version.py` centralizes the library version and should be the single source when updating releases. Tests reside in `tests/` with `test_*.py` modules mirroring public APIs. Contributor-facing docs and MkDocs content sit in `docs/`, while packaging metadata and tooling configuration are in `pyproject.toml`.
+## Quickstart
+- Run `make setup` to install all extras and dev tools.
+- Run `make check` to run format (rewrites files in place), lint, ty check, and tests.
+- Open pull requests against `master`.
+- Do not commit or push unless explicitly instructed.
 
-## Build, Test, and Development Commands
-Install dependencies with `uv sync --all-extras --group dev` so the docs, pandas extras, and developer tooling are available. Use `uv run pytest` for the default suite and coverage, matching the `--cov=wbdata` addopts in configuration. Run `uv run ruff check wbdata tests` to lint, and `uv run mypy wbdata` for type validation. During documentation work, serve the site locally via `uv run mkdocs serve`.
+## Dev Loop
+- Run `make format` (ruff format).
+- Run `make lint` (ruff check wbdata tests).
+- Run `make typecheck` (ty check wbdata).
+- Run `make test` (pytest with coverage addopts from config).
+- Preview docs with `uv run mkdocs serve`.
 
-## Coding Style & Naming Conventions
-Follow standard Python formatting with four-space indentation and readable, snake_case symbols. Public APIs exposed in `wbdata/__init__.py` should maintain descriptive, lowercase names; classes stay in CapWords. Ruff enforces PEP 8, import sorting, and selected Bugbear/Simplify rules—run it before committing. Keep modules typed, updating `py.typed` coverage when adding packages, and prefer explicit re-exports in `__all__` blocks where applicable.
+## Project Layout
+- Keep code in `wbdata/` (client, caching, API helpers); treat `wbdata/version.py` as the single source of truth for the version.
+- Add tests in `tests/` with `test_*.py` mirroring public APIs.
+- Maintain docs in `docs/` and `mkdocs.yml`; adjust packaging/tooling in `pyproject.toml`.
 
-## Testing Guidelines
-Write new tests under `tests/` using `pytest` conventions (`test_feature.py`, functions starting with `test_`). When adding network-heavy scenarios, leverage fixtures to isolate HTTP calls. Keep coverage from `pytest-cov` stable by exercising new branches, and include regression cases that mirror reported issues.
+## Style
+- Use Python 4-space indent, snake_case; CapWords for classes; re-export via `__all__` when needed.
+- Run ruff (PEP8, imports, Bugbear/Simplify) before commits.
+- Preserve typing coverage (`py.typed`); prefer explicit types.
 
-## Commit & Pull Request Guidelines
-Aim for concise, imperative subjects, optionally prefixed with Conventional Commit types as seen in `git log` (e.g., `fix: improve caching`). Reference related issues or discussions with `(#123)` in the subject when merging via GitHub. Before opening a PR, ensure lint, typing, and tests pass, document user-facing changes, and provide a short summary plus reproduction or screenshots when behavior shifts.
+## Testing
+- Add pytest cases under `tests/`; favor fixtures for network isolation.
+- Maintain coverage by exercising new branches; include regression cases for reported bugs.
 
-## Documentation & Release Notes
-Update `docs/` pages when modifying user workflows, and verify the navigation using the MkDocs preview. Release metadata lives in `wbdata/version.py`; bump it in sync with changelog entries and confirm that packaging files (`pyproject.toml`, `MANIFEST.in`) need no extra updates.
+## PR Expectations
+- Use conventional, imperative titles (e.g., `fix: improve caching`).
+- Ensure format/lint/type/tests pass; document user-facing changes; include repro or screenshots when behavior shifts.
+- Bump `wbdata/version.py` alongside changelog/release notes when shipping releases.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..df4357a
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,22 @@
+setup:
+	uv sync --all-extras --group dev
+.PHONY: setup
+
+format:
+	uv run ruff format wbdata tests docs
+.PHONY: format
+
+lint:
+	uv run ruff check wbdata tests
+.PHONY: lint
+
+typecheck:
+	uv run ty check wbdata
+.PHONY: typecheck
+
+test:
+	uv run pytest
+.PHONY: test
+
+check: format lint typecheck test
+.PHONY: check
diff --git a/mkdocs.yml b/mkdocs.yml
index 5832bc0..520641a 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -8,7 +8,7 @@ plugins:
           options:
             show_source: false
             members_order: source
-            docstrings_options:
+            docstring_options:
               returns_named_value: false
               returns_multiple_items: false
 watch:
diff --git a/tests/test_cache.py b/tests/test_cache.py
new file mode 100644
index 0000000..13ac2ea
--- /dev/null
+++ b/tests/test_cache.py
@@ -0,0 +1,46 @@
+import wbdata.cache as cache
+
+
+def test_get_cache_recovers_from_corruption(tmp_path, monkeypatch):
+    cache_path = tmp_path / "cachefile"
+    cache_path.write_bytes(b"corrupt")
+
+    call_count = {"value": 0}
+
+    class FakeCache:
+        def expire(self):
+            call_count["value"] += 1
+            if call_count["value"] == 1:
+                raise SystemError("boom")
+
+    monkeypatch.setattr(
+        cache.shelved_cache, "PersistentCache", lambda *args, **kwargs: FakeCache()
+    )
+
+    result = cache.get_cache(path=cache_path, ttl_days=1, max_size=1)
+
+    assert isinstance(result, FakeCache)
+    assert call_count["value"] == 2  # retried after clearing corruption
+    assert not cache_path.exists()
+
+
+def test_clear_cache_files_removes_shelve_variants(tmp_path):
+    base = tmp_path / "cachefile"
+    variants = [
+        base,
+        base.with_suffix(".db"),
+        base.with_suffix(".dat"),
+        base.with_suffix(".dir"),
+        base.with_suffix(".bak"),
+        tmp_path / "cachefile.extra",
+    ]
+    for path in variants:
+        if path.suffix == ".dir":
+            path.mkdir()
+        else:
+            path.write_text("x")
+
+    cache._clear_cache_files(base)
+
+    for path in variants:
+        assert not path.exists()
diff --git a/wbdata/cache.py b/wbdata/cache.py
index 60fc272..cdcc17e 100644
--- a/wbdata/cache.py
+++ b/wbdata/cache.py
@@ -6,6 +6,8 @@
 import datetime as dt
 import logging
 import os
+import pickle
+import shutil
 from pathlib import Path
 
 import appdirs
@@ -65,16 +67,49 @@ def get_cache(
             `WBDATA_CACHE_MAX_SIZE`.
 
     """
-    path = path or CACHE_PATH
-    Path(path).parent.mkdir(parents=True, exist_ok=True)
+    path = Path(path or CACHE_PATH)
+    path.parent.mkdir(parents=True, exist_ok=True)
     ttl_days = ttl_days or TTL_DAYS
     max_size = max_size or MAX_SIZE
-    cache = shelved_cache.PersistentCache(
-        cachetools.TTLCache,
-        filename=str(path),
-        maxsize=max_size,
-        ttl=dt.timedelta(days=ttl_days),
-        timer=dt.datetime.now,
-    )
+
+    def _build_cache() -> shelved_cache.PersistentCache:
+        return shelved_cache.PersistentCache(
+            cachetools.TTLCache,
+            filename=str(path),
+            maxsize=max_size,
+            ttl=dt.timedelta(days=ttl_days),
+            timer=dt.datetime.now,
+        )
+
+    try:
+        cache = _build_cache()
+        cache.expire()
+        return cache
+    except (SystemError, EOFError, pickle.UnpicklingError, OSError) as exc:
+        log.warning("Cache at %s failed to load (%s); recreating", path, exc)
+        _clear_cache_files(path)
+
+    cache = _build_cache()
     cache.expire()
     return cache
+
+
+def _clear_cache_files(path: Path) -> None:
+    """Remove shelve-backed cache files derived from *path*.
+
+    Shelve backends append suffixes (e.g. `.db`, `.bak`, `.dat`, `.dir`) to the
+    full file name, so match siblings on `path.name` (literally, not as a glob
+    pattern) rather than `path.stem`, which would also hit unrelated files.
+    """
+
+    suffixes = (".db", ".bak", ".dat", ".dir")
+    prefix = f"{path.name}."
+    candidates = {path, *(path.with_name(path.name + s) for s in suffixes)}
+    candidates.update(p for p in path.parent.glob("*") if p.name.startswith(prefix))
+    for candidate in candidates:
+        try:
+            candidate.unlink()
+        except FileNotFoundError:
+            continue
+        except (IsADirectoryError, PermissionError):
+            shutil.rmtree(candidate, ignore_errors=True)
diff --git a/wbdata/client.py b/wbdata/client.py
index f64e38f..471755e 100644
--- a/wbdata/client.py
+++ b/wbdata/client.py
@@ -8,7 +8,7 @@
 import re
 from collections.abc import Generator, Iterable, Sequence
 from pathlib import Path
-from typing import Any
+from typing import Any, cast
 
 import decorator
 import requests
@@ -471,11 +471,18 @@ def get_series(
             parse_dates=parse_dates,
             skip_cache=skip_cache,
         )
-        df = pd.DataFrame(
-            [[i["country"]["value"], i["date"], i["value"]] for i in raw_data],
-            columns=["country", "date", name],
+        rows = cast(
+            list[tuple[str, Any, float | None]],
+            [
+                (
+                    str(i["country"]["value"]),
+                    i["date"],  # pass through unchanged, as before; str() would undo date parsing
+                    _cast_float(i["value"]),
+                )
+                for i in raw_data
+            ],
+        )
-        df[name] = df[name].map(_cast_float)
+        df = pd.DataFrame.from_records(rows, columns=["country", "date", name])
         if not keep_levels and len(df["country"].unique()) == 1:
             df = df.set_index("date")
         elif not keep_levels and len(df["date"].unique()) == 1:
diff --git a/wbdata/version.py b/wbdata/version.py
index 6849410..a82b376 100644
--- a/wbdata/version.py
+++ b/wbdata/version.py
@@ -1 +1 @@
-__version__ = "1.1.0"
+__version__ = "1.1.1"