diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml
index 4ac88e4..c738431 100644
--- a/.github/workflows/cd.yml
+++ b/.github/workflows/cd.yml
@@ -27,6 +27,38 @@ jobs:
         with:
           fetch-depth: 0
 
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.x"
+
+      - name: Read idc-index-data version from pyproject.toml
+        run: |
+          python - <<'PY' > /tmp/idc_ver.env
+          import tomllib, re
+          with open('pyproject.toml','rb') as f:
+              data = tomllib.load(f)
+          ver = None
+          for d in data['project']['dependencies']:
+              m = re.match(r'idc-index-data==(.+)', d)
+              if m:
+                  ver = m.group(1)
+                  break
+          assert ver, 'idc-index-data version not found in pyproject.toml'
+          print(f'IDC_INDEX_DATA_VERSION={ver}')
+          PY
+          cat /tmp/idc_ver.env >> $GITHUB_ENV
+
+      - name: Preinstall idc-index-data for cache generation
+        env:
+          IDC_INDEX_DATA_VERSION: ${{ env.IDC_INDEX_DATA_VERSION }}
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install "idc-index-data==${IDC_INDEX_DATA_VERSION}"
+
+      - name: Generate bundled indices cache
+        run: |
+          python scripts/generate_indices_cache.py
+
       - uses: hynek/build-and-inspect-python-package@v2
 
   publish:
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index cc67ca8..d067b8d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -53,6 +53,34 @@ jobs:
           python-version: ${{ matrix.python-version }}
           allow-prereleases: true
 
+      - name: Read idc-index-data version from pyproject.toml
+        run: |
+          python - <<'PY' > /tmp/idc_ver.env
+          import tomllib, re
+          with open('pyproject.toml','rb') as f:
+              data = tomllib.load(f)
+          ver = None
+          for d in data['project']['dependencies']:
+              m = re.match(r'idc-index-data==(.+)', d)
+              if m:
+                  ver = m.group(1)
+                  break
+          assert ver, 'idc-index-data version not found in pyproject.toml'
+          print(f'IDC_INDEX_DATA_VERSION={ver}')
+          PY
+          cat /tmp/idc_ver.env >> $GITHUB_ENV
+
+      - name: Preinstall idc-index-data for cache generation
+        env:
+          IDC_INDEX_DATA_VERSION: ${{ env.IDC_INDEX_DATA_VERSION }}
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install "idc-index-data==${IDC_INDEX_DATA_VERSION}"
+
+      - name: Generate bundled indices cache
+        run: |
+          python scripts/generate_indices_cache.py
+
       - name: Install package
         run: python -m pip install .[test]
diff --git a/idc_index/index.py b/idc_index/index.py
index c2bbc7b..48b1fea 100644
--- a/idc_index/index.py
+++ b/idc_index/index.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 
+import json
 import logging
 import os
 import re
@@ -11,6 +12,7 @@ import time
 from collections.abc import Callable
 from importlib.metadata import distribution, version
+from importlib.resources import files
 from pathlib import Path
 
 import duckdb
@@ -22,6 +24,12 @@
 from packaging.version import Version
 from tqdm import tqdm
 
+from .indices_util import (
+    fetch_indices_from_github,
+    load_indices_cache,
+    save_indices_cache,
+)
+
 aws_endpoint_url = "https://s3.amazonaws.com"
 gcp_endpoint_url = "https://storage.googleapis.com"
 asset_endpoint_url = f"https://github.com/ImagingDataCommons/idc-index-data/releases/download/{idc_index_data.__version__}"
@@ -86,32 +94,42 @@ def client(cls) -> IDCClient:
         return cls._client
 
     def __init__(self):
+        start_time = time.time()
+
         # Read main index file
+        logger.debug("Starting IDCClient initialization")
         file_path = idc_index_data.IDC_INDEX_PARQUET_FILEPATH
         logger.debug(f"Reading index file v{idc_index_data.__version__}")
         self.index = pd.read_parquet(file_path)
+        logger.debug(f"Index file loaded in {time.time() - start_time:.3f}s")
 
        # initialize crdc_series_uuid for the index
        # TODO: in the future, after https://github.com/ImagingDataCommons/idc-index/pull/113
        # is merged (to minimize disruption), it will make more sense to change
        # idc-index-data to separate bucket from crdc_series_uuid, add support for GCP,
        # and consequently simplify the code here
+        step_start = time.time()
         self.index["crdc_series_uuid"] = (
             self.index["series_aws_url"].str.split("/").str[3]
         )
+        logger.debug(f"crdc_series_uuid initialized in {time.time() - step_start:.3f}s")
 
+        step_start = time.time()
         self.prior_versions_index_path = (
             idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH
         )
         file_path = idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH
         self.prior_versions_index = pd.read_parquet(file_path)
+        logger.debug(f"Prior versions index loaded in {time.time() - step_start:.3f}s")
 
         # self.index = self.index.astype(str).replace("nan", "")
+        step_start = time.time()
         self.index["series_size_MB"] = self.index["series_size_MB"].astype(float)
         self.collection_summary = self.index.groupby("collection_id").agg(
             {"Modality": pd.Series.unique, "series_size_MB": "sum"}
         )
+        logger.debug(f"Collection summary computed in {time.time() - step_start:.3f}s")
 
         self.idc_version = f"v{Version(idc_index_data.__version__).major}"
@@ -125,38 +143,10 @@ def __init__(self):
         )
         self.clinical_data_dir = None
 
-        self.indices_overview = {
-            "index": {
-                "description": "Main index containing one row per DICOM series.",
-                "installed": True,
-                "url": None,
-                "file_path": idc_index_data.IDC_INDEX_PARQUET_FILEPATH,
-            },
-            "prior_versions_index": {
-                "description": "index containing one row per DICOM series from all previous IDC versions that are not in current version.",
-                "installed": True,
-                "url": None,
-                "file_path": idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH,
-            },
-            "sm_index": {
-                "description": "DICOM Slide Microscopy series-level index.",
-                "installed": False,
-                "url": f"{asset_endpoint_url}/sm_index.parquet",
-                "file_path": None,
-            },
-            "sm_instance_index": {
-                "description": "DICOM Slide Microscopy instance-level index.",
-                "installed": False,
-                "url": f"{asset_endpoint_url}/sm_instance_index.parquet",
-                "file_path": None,
-            },
-            "clinical_index": {
-                "description": "Index of clinical data accompanying the available images.",
-                "installed": False,
-                "url": f"{asset_endpoint_url}/clinical_index.parquet",
-                "file_path": None,
-            },
-        }
+        # Discover available indices from GitHub releases with caching
+        step_start = time.time()
+        self.indices_overview = self._discover_available_indices()
+        logger.debug(f"Indices discovered in {time.time() - step_start:.3f}s")
 
         # these will point to the dataframes containing the respective indices, once installed
         self.sm_index = None
@@ -164,6 +154,7 @@ def __init__(self):
         self.clinical_index = None
 
         # Lookup s5cmd
+        step_start = time.time()
         self.s5cmdPath = shutil.which("s5cmd")
         if self.s5cmdPath is None:
             # Workaround to support environment without a properly setup PATH
@@ -181,6 +172,93 @@ def __init__(self):
         logger.debug(f"Found s5cmd executable: {self.s5cmdPath}")
         # ... and check it can be executed
         subprocess.check_call([self.s5cmdPath, "--help"], stdout=subprocess.DEVNULL)
+        logger.debug(f"s5cmd validated in {time.time() - step_start:.3f}s")
+
+        logger.debug(
+            f"IDCClient initialization completed in {time.time() - start_time:.3f}s"
+        )
+
+    def _discover_available_indices(self, force_refresh=False):
+        """Discovers available indices from bundled cache or GitHub releases.
+
+        First tries to load from the bundled cache file, then from the user cache directory.
+        Falls back to the GitHub API if the cache is missing, stale, or force_refresh is True.
+
+        Args:
+            force_refresh (bool): If True, forces a fresh discovery ignoring all caches.
+
+        Returns:
+            dict: Dictionary of available indices with metadata including descriptions.
+        """
+        # json and importlib.resources used here are imported at module top level to satisfy linters
+
+        user_cache_file = os.path.join(self.indices_data_dir, "indices_cache.json")
+
+        # Define pre-installed indices
+        pre_installed_indices = {
+            "idc_index": {
+                "description": "Main index containing one row per DICOM series.",
+                "installed": True,
+                "url": None,
+                "file_path": str(idc_index_data.IDC_INDEX_PARQUET_FILEPATH),
+            },
+            "prior_versions_index": {
+                "description": "Index containing one row per DICOM series from all previous IDC versions that are not in the current version.",
+                "installed": True,
+                "url": None,
+                "file_path": str(idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH),
+            },
+        }
+
+        if not force_refresh:
+            # Try bundled cache first
+            try:
+                bundled_cache_path = files("idc_index").joinpath("indices_cache.json")
+                if bundled_cache_path.is_file():
+                    cache_data = load_indices_cache(str(bundled_cache_path))
+                    if (
+                        cache_data
+                        and cache_data.get("version") == idc_index_data.__version__
+                    ):
+                        logger.debug(
+                            f"Using bundled cache for version {idc_index_data.__version__}"
+                        )
+                        return cache_data["indices"]
+                    if cache_data:
+                        logger.warning(
+                            "Bundled cache version mismatch: "
+                            f"{cache_data.get('version')} != {idc_index_data.__version__}. "
+                            "Will attempt to fetch fresh data from GitHub."
+                        )
+            except Exception as e:
+                logger.debug(f"Could not load bundled cache: {e}")
+
+            # Try user cache
+            cache_data = load_indices_cache(user_cache_file)
+            if cache_data:
+                if cache_data.get("version") == idc_index_data.__version__:
+                    logger.debug(
+                        f"Using user cache for version {idc_index_data.__version__}"
+                    )
+                    return cache_data["indices"]
+                logger.warning(
+                    "User cache version mismatch: "
+                    f"{cache_data.get('version')} != {idc_index_data.__version__}. "
+                    "Fetching fresh data from GitHub."
+                )
+
+        # Fetch from GitHub (either forced or cache miss/stale)
+        logger.info("Fetching indices from GitHub releases...")
+        indices = fetch_indices_from_github(
+            idc_index_data.__version__,
+            asset_endpoint_url,
+            pre_installed_indices,
+        )
+
+        # Save to user cache
+        save_indices_cache(user_cache_file, idc_index_data.__version__, indices)
+
+        return indices
 
     @staticmethod
     def _replace_aws_with_gcp_buckets(dataframe, column_name):
@@ -398,6 +476,28 @@ def fetch_index(self, index_name) -> None:
             self.indices_overview[index_name]["installed"] = True
             self.indices_overview[index_name]["file_path"] = filepath
 
+            # Also try to fetch and cache the JSON schema if not already present
+            if "table_description" not in self.indices_overview[index_name]:
+                json_url = f"{asset_endpoint_url}/{index_name}.json"
+                json_filepath = os.path.join(
+                    self.indices_data_dir, f"{index_name}.json"
+                )
+                try:
+                    schema_response = requests.get(json_url, timeout=30)
+                    if schema_response.status_code == 200:
+                        with open(json_filepath, "w") as f:
+                            f.write(schema_response.text)
+                        schema_data = schema_response.json()
+                        self.indices_overview[index_name]["table_description"] = (
+                            schema_data.get("table_description")
+                        )
+                        self.indices_overview[index_name]["columns"] = (
+                            schema_data.get("columns", [])
+                        )
+                        logger.debug(f"Fetched and cached schema for {index_name}")
+                except Exception as e:
+                    logger.debug(f"Could not fetch schema for {index_name}: {e}")
+
         else:
             logger.error(
                 f"Failed to fetch index from URL {self.indices_overview[index_name]['url']}: {response.status_code}"
@@ -458,6 +558,97 @@ def get_collections(self):
         unique_collections = self.index["collection_id"].unique()
         return unique_collections.tolist()
 
+    def get_index_schema(self, index_name):
+        """Returns the full schema for a requested index including column descriptions.
+
+        Args:
+            index_name (str): Name of the index (e.g., 'sm_index', 'clinical_index').
+
+        Returns:
+            dict: Dictionary containing 'table_description' (str) and 'columns' (list of dicts
+                with 'name', 'type', 'mode', 'description' fields).
+
+        Raises:
+            ValueError: If index_name is not recognized.
+        """
+        if index_name not in self.indices_overview:
+            msg = (
+                f"Index '{index_name}' not found. "
+                f"Available indices: {list(self.indices_overview.keys())}"
+            )
+            raise ValueError(msg)
+
+        # Check if schema is already loaded
+        if "table_description" in self.indices_overview[index_name]:
+            return {
+                "table_description": self.indices_overview[index_name].get(
+                    "table_description"
+                ),
+                "columns": self.indices_overview[index_name].get("columns", []),
+            }
+
+        # Try to fetch schema from cached JSON file or download it
+        json_filepath = os.path.join(self.indices_data_dir, f"{index_name}.json")
+
+        # Check if JSON file exists locally
+        if os.path.exists(json_filepath):
+            try:
+                with open(json_filepath) as f:
+                    schema_data = json.load(f)
+                self.indices_overview[index_name]["table_description"] = (
+                    schema_data.get("table_description")
+                )
+                self.indices_overview[index_name]["columns"] = schema_data.get(
+                    "columns", []
+                )
+                return {
+                    "table_description": schema_data.get("table_description"),
+                    "columns": schema_data.get("columns", []),
+                }
+            except Exception as e:
+                logger.warning(f"Failed to read cached schema for {index_name}: {e}")
+
+        # Try to download schema from GitHub
+        json_url = f"{asset_endpoint_url}/{index_name}.json"
+        try:
+            logger.debug(f"Fetching schema from {json_url}")
+            response = requests.get(json_url, timeout=30)
+            if response.status_code == 200:
+                schema_data = response.json()
+
+                # Cache it locally
+                os.makedirs(self.indices_data_dir, exist_ok=True)
+                with open(json_filepath, "w") as f:
+                    json.dump(schema_data, f, indent=2)
+
+                # Update indices_overview
+                self.indices_overview[index_name]["table_description"] = (
+                    schema_data.get("table_description")
+                )
+                self.indices_overview[index_name]["columns"] = schema_data.get(
+                    "columns", []
+                )
+
+                return {
+                    "table_description": schema_data.get("table_description"),
+                    "columns": schema_data.get("columns", []),
+                }
+            logger.warning(
+                f"Failed to fetch schema from {json_url}: {response.status_code}"
+            )
+            return {
+                "table_description": f"Schema not available for {index_name}",
+                "columns": [],
+            }
+        except Exception as e:
+            logger.warning(f"Error fetching schema for {index_name}: {e}")
+            return {
+                "table_description": f"Schema not available for {index_name}",
+                "columns": [],
+            }
+
     def get_series_size(self, seriesInstanceUID):
         """Gets cumulative size (MB) of the DICOM instances in a given SeriesInstanceUID.
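Taken together, the index.py changes above replace the hardcoded indices_overview dictionary with cache-backed discovery and a schema lookup API. A minimal usage sketch (hypothetical session; the exact set of discovered indices and the per-column fields depend on which assets the installed idc-index-data release publishes):

from idc_index.index import IDCClient

client = IDCClient()

# Indices discovered from the bundled cache, the user cache, or the GitHub release
for name, meta in client.indices_overview.items():
    print(name, meta["installed"], meta["description"])

# Column-level schema, served from the cache when possible and otherwise downloaded
# from the release assets; sm_index is only present if the release publishes it
if "sm_index" in client.indices_overview:
    schema = client.get_index_schema("sm_index")
    print(schema["table_description"])
    print([col.get("name") for col in schema["columns"]])

    # Download the parquet file itself and mark the index as installed
    client.fetch_index("sm_index")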
diff --git a/idc_index/indices_util.py b/idc_index/indices_util.py
new file mode 100644
index 0000000..4755656
--- /dev/null
+++ b/idc_index/indices_util.py
@@ -0,0 +1,114 @@
+"""Shared utilities for discovering and caching index metadata."""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+from typing import Any
+
+import requests
+
+logger = logging.getLogger(__name__)
+
+
+def fetch_indices_from_github(
+    version: str, asset_endpoint_url: str, pre_installed_indices: dict[str, Any]
+) -> dict[str, Any]:
+    """Fetch available indices and their schemas from GitHub releases."""
+    indices = pre_installed_indices.copy()
+
+    # Fetch schemas for pre-installed indices
+    for index_name in list(pre_installed_indices.keys()):
+        json_url = f"{asset_endpoint_url}/{index_name}.json"
+        try:
+            schema_response = requests.get(json_url, timeout=30)
+            if schema_response.status_code == 200:
+                schema_data = schema_response.json()
+                description = schema_data.get(
+                    "table_description", indices[index_name]["description"]
+                )
+                indices[index_name]["description"] = description
+                indices[index_name]["table_description"] = schema_data.get(
+                    "table_description"
+                )
+                indices[index_name]["columns"] = schema_data.get("columns", [])
+                logger.debug(f"Fetched schema for pre-installed index {index_name}")
+        except Exception as e:
+            logger.debug(f"Error fetching schema for {index_name}: {e}")
+
+    # Discover additional indices from GitHub releases
+    try:
+        api_url = f"https://api.github.com/repos/ImagingDataCommons/idc-index-data/releases/tags/{version}"
+        logger.debug(f"Querying GitHub API: {api_url}")
+        response = requests.get(api_url, timeout=30)
+
+        if response.status_code == 200:
+            release_data = response.json()
+            assets = release_data.get("assets", [])
+            parquet_assets = [
+                asset["name"] for asset in assets if asset["name"].endswith(".parquet")
+            ]
+
+            for parquet_name in parquet_assets:
+                index_name = parquet_name.replace(".parquet", "")
+                if index_name in pre_installed_indices:
+                    continue
+
+                json_url = f"{asset_endpoint_url}/{index_name}.json"
+                description = f"Index table: {index_name}"
+                schema_data = None
+
+                try:
+                    schema_response = requests.get(json_url, timeout=30)
+                    if schema_response.status_code == 200:
+                        schema_data = schema_response.json()
+                        description = schema_data.get("table_description", description)
+                except Exception as e:
+                    logger.warning(f"Error fetching schema for {index_name}: {e}")
+
+                indices[index_name] = {
+                    "description": description,
+                    "installed": False,
+                    "url": f"{asset_endpoint_url}/{parquet_name}",
+                    "file_path": None,
+                }
+
+                if schema_data:
+                    indices[index_name]["table_description"] = schema_data.get(
+                        "table_description"
+                    )
+                    indices[index_name]["columns"] = schema_data.get("columns", [])
+
+    except Exception as e:
+        logger.warning(f"Error during index discovery: {e}")
+
+    return indices
+
+
+def save_indices_cache(
+    cache_file_path: str, version: str, indices: dict[str, Any]
+) -> None:
+    """Save indices metadata to cache file."""
+    try:
+        os.makedirs(os.path.dirname(cache_file_path), exist_ok=True)
+        cache_data = {"version": version, "indices": indices}
+        with open(cache_file_path, "w") as f:
+            json.dump(cache_data, f, indent=2)
+        logger.debug(f"Cached indices list to {cache_file_path}")
+    except Exception as e:
+        logger.warning(f"Failed to cache indices list: {e}")
+
+
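For reference, a sketch of the cache file that save_indices_cache above writes and load_indices_cache below expects. The top-level keys come directly from these two functions; the per-index fields come from fetch_indices_from_github; all concrete values are illustrative only:

{
  "version": "<idc-index-data version>",
  "indices": {
    "idc_index": {
      "description": "Main index containing one row per DICOM series.",
      "installed": true,
      "url": null,
      "file_path": "/path/to/idc_index.parquet",
      "table_description": "...",
      "columns": [{"name": "...", "type": "...", "mode": "...", "description": "..."}]
    },
    "sm_index": {
      "description": "...",
      "installed": false,
      "url": "https://github.com/ImagingDataCommons/idc-index-data/releases/download/<version>/sm_index.parquet",
      "file_path": null
    }
  }
}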
+def load_indices_cache(cache_file_path: str) -> dict[str, Any] | None:
+    """Load indices metadata from cache file."""
+    if not os.path.exists(cache_file_path):
+        return None
+
+    try:
+        with open(cache_file_path) as f:
+            cache_data = json.load(f)
+        if "version" in cache_data and "indices" in cache_data:
+            return cache_data
+    except (json.JSONDecodeError, KeyError) as e:
+        logger.warning(f"Failed to read indices cache: {e}")
+    return None
diff --git a/noxfile.py b/noxfile.py
index 4d680ac..6075b7b 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -2,15 +2,44 @@
 
 import argparse
 import shutil
+import subprocess
+import sys
 from pathlib import Path
 
 import nox
 
+if sys.version_info >= (3, 11):
+    import tomllib
+else:
+    try:
+        import tomli as tomllib
+    except ImportError:
+        # If tomli is not available, install it in the current environment
+        subprocess.check_call([sys.executable, "-m", "pip", "install", "tomli"])
+        import tomli as tomllib
+
 DIR = Path(__file__).parent.resolve()
 
 nox.options.sessions = ["lint", "pylint", "tests"]
 
 
+def get_runtime_dependencies() -> list[str]:
+    """Extract runtime dependencies from pyproject.toml."""
+    pyproject_path = DIR / "pyproject.toml"
+    with open(pyproject_path, "rb") as f:
+        pyproject = tomllib.load(f)
+
+    return pyproject.get("project", {}).get("dependencies", [])
+
+
+def generate_cache(session: nox.Session) -> None:
+    """Generate bundled indices cache (required for package build)."""
+    # Install all runtime dependencies needed for the generator script
+    dependencies = get_runtime_dependencies()
+    session.install(*dependencies)
+    session.run("python", "scripts/generate_indices_cache.py")
+
+
 @nox.session
 def lint(session: nox.Session) -> None:
     """
@@ -29,6 +58,10 @@ def pylint(session: nox.Session) -> None:
     """
     # This needs to be installed into the package environment, and is slower
     # than a pre-commit check
+
+    # Generate bundled cache before installing package (required for build)
+    generate_cache(session)
+
     session.install(".", "pylint")
     session.run("pylint", "idc_index", *session.posargs)
 
@@ -38,6 +71,9 @@ def tests(session: nox.Session) -> None:
     """
     Run the unit and regular tests.
""" + # Generate bundled cache before installing package (required for build) + generate_cache(session) + session.install(".[test]") session.run("pytest", *session.posargs) @@ -60,6 +96,9 @@ def docs(session: nox.Session) -> None: extra_installs = ["sphinx-autobuild"] if args.serve else [] + # Generate bundled cache before installing package (required for build) + generate_cache(session) + session.install("-e.[docs]", *extra_installs) session.chdir("docs") @@ -113,5 +152,8 @@ def build(session: nox.Session) -> None: if build_path.exists(): shutil.rmtree(build_path) + # Generate bundled cache before building package (required for build) + generate_cache(session) + session.install("build") session.run("python", "-m", "build") diff --git a/pyproject.toml b/pyproject.toml index 89b7694..2de9746 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,6 +79,11 @@ build.hooks.vcs.version-file = "idc_index/_version.py" features = ["test"] scripts.test = "pytest {args}" +[tool.hatch.build.targets.wheel] +packages = ["idc_index"] + +[tool.hatch.build.targets.wheel.force-include] +"idc_index/indices_cache.json" = "idc_index/indices_cache.json" [tool.pytest.ini_options] minversion = "6.0" @@ -239,4 +244,5 @@ messages_control.disable = [ "undefined-loop-variable", "unspecified-encoding", "unused-variable", + "broad-exception-caught", ] diff --git a/scripts/generate_indices_cache.py b/scripts/generate_indices_cache.py new file mode 100644 index 0000000..47a9023 --- /dev/null +++ b/scripts/generate_indices_cache.py @@ -0,0 +1,138 @@ +"""Generate indices cache at build time.""" + +from __future__ import annotations + +import json +import logging +import os +from pathlib import Path + +import idc_index_data +import requests + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def fetch_indices_from_github( + version: str, asset_endpoint_url: str, pre_installed_indices: dict +) -> dict: + """Fetch available indices and their schemas from GitHub releases.""" + indices = pre_installed_indices.copy() + + # Fetch schemas for pre-installed indices + for index_name in list(pre_installed_indices.keys()): + json_url = f"{asset_endpoint_url}/{index_name}.json" + try: + schema_response = requests.get(json_url, timeout=30) + if schema_response.status_code == 200: + schema_data = schema_response.json() + description = schema_data.get( + "table_description", indices[index_name]["description"] + ) + indices[index_name]["description"] = description + indices[index_name]["table_description"] = schema_data.get( + "table_description" + ) + indices[index_name]["columns"] = schema_data.get("columns", []) + logger.debug(f"Fetched schema for pre-installed index {index_name}") + except Exception as e: + logger.debug(f"Error fetching schema for {index_name}: {e}") + + # Discover additional indices from GitHub releases + try: + api_url = f"https://api.github.com/repos/ImagingDataCommons/idc-index-data/releases/tags/{version}" + logger.debug(f"Querying GitHub API: {api_url}") + response = requests.get(api_url, timeout=30) + + if response.status_code == 200: + release_data = response.json() + assets = release_data.get("assets", []) + parquet_assets = [ + asset["name"] for asset in assets if asset["name"].endswith(".parquet") + ] + + for parquet_name in parquet_assets: + index_name = parquet_name.replace(".parquet", "") + if index_name in pre_installed_indices: + continue + + json_url = f"{asset_endpoint_url}/{index_name}.json" + description = f"Index table: {index_name}" + schema_data = None + + try: 
+                    schema_response = requests.get(json_url, timeout=30)
+                    if schema_response.status_code == 200:
+                        schema_data = schema_response.json()
+                        description = schema_data.get("table_description", description)
+                except Exception as e:
+                    logger.warning(f"Error fetching schema for {index_name}: {e}")
+
+                indices[index_name] = {
+                    "description": description,
+                    "installed": False,
+                    "url": f"{asset_endpoint_url}/{parquet_name}",
+                    "file_path": None,
+                }
+
+                if schema_data:
+                    indices[index_name]["table_description"] = schema_data.get(
+                        "table_description"
+                    )
+                    indices[index_name]["columns"] = schema_data.get("columns", [])
+
+    except Exception as e:
+        logger.warning(f"Error during index discovery: {e}")
+
+    return indices
+
+
+def save_indices_cache(cache_file_path: str, version: str, indices: dict) -> None:
+    """Save indices metadata to cache file."""
+    try:
+        os.makedirs(os.path.dirname(cache_file_path), exist_ok=True)
+        cache_data = {"version": version, "indices": indices}
+        with open(cache_file_path, "w") as f:
+            json.dump(cache_data, f, indent=2)
+        logger.debug(f"Cached indices list to {cache_file_path}")
+    except Exception as e:
+        logger.warning(f"Failed to cache indices list: {e}")
+
+
+def main():
+    """Generate indices cache for the current idc-index-data version."""
+    version = idc_index_data.__version__
+    asset_endpoint_url = f"https://github.com/ImagingDataCommons/idc-index-data/releases/download/{version}"
+
+    # Define pre-installed indices
+    pre_installed_indices = {
+        "idc_index": {
+            "description": "Main index containing one row per DICOM series.",
+            "installed": True,
+            "url": None,
+            "file_path": str(idc_index_data.IDC_INDEX_PARQUET_FILEPATH),
+        },
+        "prior_versions_index": {
+            "description": "Index containing one row per DICOM series from all previous IDC versions that are not in the current version.",
+            "installed": True,
+            "url": None,
+            "file_path": str(idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH),
+        },
+    }
+
+    logger.info(f"Generating indices cache for version {version}")
+    indices = fetch_indices_from_github(
+        version, asset_endpoint_url, pre_installed_indices
+    )
+
+    # Save to package data directory
+    cache_file = Path(__file__).parent.parent / "idc_index" / "indices_cache.json"
+    save_indices_cache(str(cache_file), version, indices)
+
+    logger.info(f"Successfully generated cache with {len(indices)} indices")
+    logger.info(f"Cache saved to: {cache_file}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/idcindex.py b/tests/idcindex.py
index ba12004..4ec24b2 100644
--- a/tests/idcindex.py
+++ b/tests/idcindex.py
@@ -1,13 +1,16 @@
 from __future__ import annotations
 
+import json
 import logging
 import os
 import tempfile
 import unittest
+from importlib.resources import files
 from itertools import product
 from pathlib import Path
 from unittest.mock import patch
 
+import idc_index_data
 import pandas as pd
 import pytest
 import requests
@@ -604,6 +607,162 @@ def test_instance_file_URLs(self):
         files_gcp = c.get_instance_file_URL(sopInstanceUID, "gcs")
         assert files_aws == files_gcp == file_url
 
+    def test_indices_discovery(self):
+        """Test that indices are loaded from bundled cache or discovered."""
+        c = IDCClient()
+
+        # Check that indices_overview exists and is not empty
+        assert c.indices_overview is not None
+        assert len(c.indices_overview) > 0
+
+        # Check that pre-installed indices are present
+        assert "idc_index" in c.indices_overview
+        assert "prior_versions_index" in c.indices_overview
+
+        # Verify pre-installed indices have required fields
["idc_index", "prior_versions_index"]: + assert c.indices_overview[index_name]["installed"] is True + assert c.indices_overview[index_name]["file_path"] is not None + assert "description" in c.indices_overview[index_name] + + # Check that additional indices are discovered (from bundled cache or GitHub) + # These should be present in the cache or GitHub release + expected_indices = ["sm_index", "sm_instance_index", "clinical_index"] + for index_name in expected_indices: + if index_name in c.indices_overview: + assert c.indices_overview[index_name]["installed"] is False + assert c.indices_overview[index_name]["url"] is not None + assert "description" in c.indices_overview[index_name] + + def test_indices_schema_from_discovery(self): + """Test that schemas are available from bundled cache or fetched from GitHub.""" + c = IDCClient() + + # Check that pre-installed indices have schema information + for index_name in ["idc_index", "prior_versions_index"]: + # Schema should be present from bundled cache or GitHub API + if "table_description" in c.indices_overview[index_name]: + assert c.indices_overview[index_name]["table_description"] is not None + assert "columns" in c.indices_overview[index_name] + assert isinstance(c.indices_overview[index_name]["columns"], list) + + def test_get_index_schema(self): + """Test the get_index_schema method.""" + c = IDCClient() + + # Test with a pre-installed index + schema = c.get_index_schema("idc_index") + assert schema is not None + assert "table_description" in schema + assert "columns" in schema + assert isinstance(schema["columns"], list) + + # Test with an invalid index name + with pytest.raises(ValueError, match="not found"): + c.get_index_schema("nonexistent_index") + + def test_indices_discovery_force_refresh(self): + """Test force refresh bypasses all caches and fetches from GitHub.""" + c = IDCClient() + + # Get initial indices overview + initial_indices = c.indices_overview.copy() + + # Force refresh (this is a private method, but we test it here) + # This should bypass both bundled and user cache and fetch from GitHub + refreshed_indices = c._discover_available_indices(force_refresh=True) + + # Check that we got indices back + assert refreshed_indices is not None + assert len(refreshed_indices) > 0 + + # Should have at least the pre-installed indices + assert "idc_index" in refreshed_indices + assert "prior_versions_index" in refreshed_indices + + def test_indices_cache_file(self): + """Test that user cache file is created when force_refresh is used.""" + + c = IDCClient() + + # Force a refresh to ensure user cache is created + c._discover_available_indices(force_refresh=True) + + cache_file = os.path.join(c.indices_data_dir, "indices_cache.json") + + # User cache file should now exist after force refresh + if os.path.exists(cache_file): + with open(cache_file) as f: + cache_data = json.load(f) + + assert "version" in cache_data + assert "indices" in cache_data + assert isinstance(cache_data["indices"], dict) + + def test_fetch_index_with_schema(self): + """Test that fetching an index also has schema information from cache.""" + c = IDCClient() + + # Fetch an index that's not pre-installed + if "sm_index" in c.indices_overview: + c.fetch_index("sm_index") + + # After fetching, the index should be marked as installed + assert c.indices_overview["sm_index"]["installed"] is True + + # Schema information should be available from bundled cache or GitHub + if "table_description" in c.indices_overview["sm_index"]: + assert 
c.indices_overview["sm_index"]["table_description"] is not None + + def test_bundled_cache_loaded(self): + """Test that bundled cache is loaded when available and version matches.""" + + # Check if bundled cache exists + try: + bundled_cache_path = files("idc_index").joinpath("indices_cache.json") + if bundled_cache_path.is_file(): + with open(bundled_cache_path) as f: + bundled_cache = json.load(f) + + # Create a client which should load from bundled cache + c = IDCClient() + + # Verify indices were loaded + assert c.indices_overview is not None + assert len(c.indices_overview) > 0 + + # Check that version matches + assert bundled_cache["version"] == idc_index_data.__version__ + except Exception as e: + # If bundled cache doesn't exist, test passes + # (this is expected during development before cache is generated) + pytest.skip(f"Bundled cache not available: {e}") + + def test_version_mismatch_warning(self): + """Test that version mismatch triggers warning and fallback to GitHub.""" + + c = IDCClient() + + # Create a user cache with wrong version + cache_file = os.path.join(c.indices_data_dir, "indices_cache.json") + wrong_version_cache = {"version": "99.99.99", "indices": {"test_index": {}}} + + os.makedirs(c.indices_data_dir, exist_ok=True) + with open(cache_file, "w") as f: + json.dump(wrong_version_cache, f) + + # Mock bundled cache to not exist for this test + with patch("importlib.resources.files") as mock_files: + mock_files.return_value.joinpath.return_value.is_file.return_value = False + + # This should trigger a version mismatch and fetch from GitHub + # (will log a warning but not fail) + indices = c._discover_available_indices() + + # Should still get valid indices from GitHub + assert indices is not None + assert len(indices) > 0 + class TestInsufficientDiskSpaceException(unittest.TestCase): def setUp(self):