From ada0be80368a6cf516584626f41fc7d026e90792 Mon Sep 17 00:00:00 2001 From: pciturri Date: Fri, 23 Jan 2026 18:11:22 -0300 Subject: [PATCH 1/2] fix: zenodo queries are now done per file per model (flavour). Reduced query rate to avoid being flagged by zenodo's api. Added throttle and captured more exceptions caused by other server responses. Avoid forceful download if file already existed. tutorials: reduced amount of models for zenodo's tutorial d). --- floatcsep/model.py | 17 ++- floatcsep/utils/accessors.py | 161 ++++++++++++---------- tests/integration/test_model_accessors.py | 4 +- tests/unit/test_accessors.py | 2 +- tests/unit/test_model.py | 2 +- tutorials/case_d/models.yml | 12 +- 6 files changed, 108 insertions(+), 90 deletions(-) diff --git a/floatcsep/model.py b/floatcsep/model.py index 5afe124..50d6a6a 100644 --- a/floatcsep/model.py +++ b/floatcsep/model.py @@ -196,10 +196,23 @@ def get_source(self) -> None: os.makedirs(container, exist_ok=True) + if expected_file.exists() and expected_file.is_file() and not self.force_stage: + return + + os.makedirs(container, exist_ok=True) + + if expected_file.exists() and expected_file.is_file() and not self.force_stage: + return + if self.giturl: from_git(self.giturl, str(container), branch=self.repo_hash, force=self.force_stage) elif self.zenodo_id: - from_zenodo(self.zenodo_id, str(container), force=True) + from_zenodo( + self.zenodo_id, + str(container), + force=self.force_stage, + keys=[expected_file.name], + ) else: pass @@ -338,7 +351,7 @@ def get_source(self, zenodo_id: int = None, giturl: str = None, **kwargs) -> Non if self.giturl: from_git(self.giturl, target_dir.as_posix(), branch=self.repo_hash, force=False) elif self.zenodo_id: - from_zenodo(self.zenodo_id, target_dir.as_posix(), force=True) + from_zenodo(self.zenodo_id, target_dir.as_posix(), force=self.force_stage) else: pass diff --git a/floatcsep/utils/accessors.py b/floatcsep/utils/accessors.py index 998fabb..a2ef4a7 100644 --- a/floatcsep/utils/accessors.py +++ b/floatcsep/utils/accessors.py @@ -1,66 +1,100 @@ +import time import git import requests import hashlib import os -import sys import shutil -def from_zenodo(record_id, folder, force=False): - """ - Download data from a Zenodo repository. +def from_zenodo(record_id, folder, force=False, keys=None): + record_url = f"https://zenodo.org/api/records/{record_id}" + max_tries = 5 - Downloads if file does not exist, checksum has changed in local respect to url or force + os.makedirs(folder, exist_ok=True) - Args: - record_id: corresponding to the Zenodo repository - folder: where the repository files will be downloaded - force: force download even if file exists and checksum passes + for attempt in range(1, max_tries + 1): + r = requests.get(record_url, timeout=30, headers={"User-Agent": "floatcsep"}) - Returns: - """ - # Grab the urls and filenames and checksums - r = requests.get(f"https://zenodo.org/api/records/{record_id}", timeout=30) - download_urls = [f["links"]["self"] for f in r.json()["files"]] - filenames = [(f["key"], f["checksum"]) for f in r.json()["files"]] + if r.status_code == 200: + break + + if r.status_code == 403: + text = (r.text or "").lower() + if "unusual traffic" in text or " None: - """ - Downloads files (from zenodo). - - Args: - url (str): the url where the file is located - filename (str): the filename required. - """ - progress_bar_length = 72 - block_size = 1024 - - r = requests.get(url, timeout=30, stream=True) - total_size = r.headers.get("content-length", False) - if not total_size: - with requests.head(url, timeout=30) as h: - try: - total_size = int(h.headers.get("Content-Length", 0)) - except TypeError: - total_size = 0 - else: - total_size = int(total_size) - download_size = 0 + os.makedirs(os.path.dirname(filename) or ".", exist_ok=True) + + r = requests.get(url, timeout=30, stream=True, headers={"User-Agent": "floatcsep"}) + r.raise_for_status() + + cl = r.headers.get("Content-Length") or r.headers.get("content-length") + try: + total_size = int(cl) if cl else 0 + except ValueError: + total_size = 0 + + base = os.path.basename(filename) if total_size: - print(f"Downloading file with size of {total_size / block_size:.3f} kB") + print(f"{base} ({total_size / (1024 * 1024):.2f} MB)") else: - print("Downloading file with unknown size") + print(f"{base}") + with open(filename, "wb") as f: - for data in r.iter_content(chunk_size=block_size): - download_size += len(data) + for data in r.iter_content(chunk_size=1024 * 64): + if not data: + continue f.write(data) - if total_size: - progress = int(progress_bar_length * download_size / total_size) - sys.stdout.write( - "\r[{}{}] {:.1f}%".format( - "█" * progress, - "." * (progress_bar_length - progress), - 100 * download_size / total_size, - ) - ) - sys.stdout.flush() - sys.stdout.write("\n") + + print(f"Complete: {base}") def check_hash(filename, checksum): - """Checks if existing file hash matches checksum from url.""" algorithm, value = checksum.split(":") if not os.path.exists(filename): return value, "invalid" diff --git a/tests/integration/test_model_accessors.py b/tests/integration/test_model_accessors.py index c4fdf03..f871078 100644 --- a/tests/integration/test_model_accessors.py +++ b/tests/integration/test_model_accessors.py @@ -216,9 +216,7 @@ def test_zenodo_fail(self): model = self.init_model(name=name, model_path=path_, zenodo_id=13117711) - with self.assertRaises( - Exception - ): # Mostly for FileNotFound, but connection errors can also arise + with self.assertRaises(FileNotFoundError): model.get_source() shutil.rmtree(dir_, ignore_errors=True) diff --git a/tests/unit/test_accessors.py b/tests/unit/test_accessors.py index e4bfa00..dbcb743 100644 --- a/tests/unit/test_accessors.py +++ b/tests/unit/test_accessors.py @@ -88,7 +88,7 @@ def test_zenodo_query(self): self._assert_files_ok() return try: - from_zenodo(4739912, zenodo_dir()) + from_zenodo(4739912, zenodo_dir(), keys=["dummy.txt", "dummy.tar"]) except Exception as e: self.skipTest(f"Zenodo flaky/unavailable: {e!r}") diff --git a/tests/unit/test_model.py b/tests/unit/test_model.py index af508b1..0173164 100644 --- a/tests/unit/test_model.py +++ b/tests/unit/test_model.py @@ -283,7 +283,7 @@ def test_get_source( mock_from_zenodo.assert_called_once_with( self.model.zenodo_id, self.mock_registry_instance.path.as_posix(), - force=True, + force=False, ) mock_from_git.assert_not_called() diff --git a/tutorials/case_d/models.yml b/tutorials/case_d/models.yml index aa19ea6..ca8c11f 100644 --- a/tutorials/case_d/models.yml +++ b/tutorials/case_d/models.yml @@ -5,20 +5,10 @@ N10L11: TEAM=N10L11.csv N25L11: TEAM=N25L11.csv N50L11: TEAM=N50L11.csv - N100L11: TEAM=N100L11.csv - SN10L11: TEAM=SN10L11.csv - SN25L11: TEAM=SN25L11.csv - SN50L11: TEAM=SN50L11.csv - SN100L11: TEAM=SN100L11.csv - WHEEL: zenodo_id: 6255575 path: models/wheel flavours: N10L11: WHEEL=N10L11.csv N25L11: WHEEL=N25L11.csv - N50L11: WHEEL=N50L11.csv - N100L11: WHEEL=N100L11.csv - SN10L11: WHEEL=SN10L11.csv - SN25L11: WHEEL=SN25L11.csv - SN50L11: WHEEL=SN50L11.csv - SN100L11: WHEEL=SN100L11.csv \ No newline at end of file + N50L11: WHEEL=N50L11.csv \ No newline at end of file From 811e7c5793dfe0d421b3e2fff5285e68e2fe3686 Mon Sep 17 00:00:00 2001 From: pciturri Date: Fri, 23 Jan 2026 19:07:33 -0300 Subject: [PATCH 2/2] fix: removed run.py objects from the Dockerfiles in case_i, which were removed from the model repository. fix: timedependent get_source() are able to clone into empty folder. --- floatcsep/model.py | 8 +++++++- tutorials/case_i/pymock/Dockerfile | 2 +- tutorials/case_i/pymock_slow/Dockerfile | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/floatcsep/model.py b/floatcsep/model.py index 50d6a6a..7a00a0d 100644 --- a/floatcsep/model.py +++ b/floatcsep/model.py @@ -310,7 +310,13 @@ def stage( and those to be generated, as well as input catalog and arguments file. """ - if self.force_stage or not self.registry.path.exists(): + need_source = ( + self.force_stage + or not self.registry.path.exists() + or (self.registry.path.is_dir() and not any(self.registry.path.iterdir())) + ) + + if need_source: os.makedirs(self.registry.dir, exist_ok=True) self.get_source(self.zenodo_id, self.giturl, branch=self.repo_hash) diff --git a/tutorials/case_i/pymock/Dockerfile b/tutorials/case_i/pymock/Dockerfile index 46f7a43..dd07e26 100644 --- a/tutorials/case_i/pymock/Dockerfile +++ b/tutorials/case_i/pymock/Dockerfile @@ -28,7 +28,7 @@ RUN python3 -m venv $VIRTUAL_ENV && pip install --upgrade pip setuptools wheel # Copy the repository from the local machine to the Docker container. ## *Only the needed folders/files for the model build* COPY --chown=$USER_UID:$USER_GID pymock/ ./pymock/ -COPY --chown=$USER_UID:$USER_GID setup.cfg run.py setup.py ./ +COPY --chown=$USER_UID:$USER_GID setup.cfg setup.py ./ # Install the pymock package. ## *Uses pip to install setup.cfg and requirements/instructions therein* diff --git a/tutorials/case_i/pymock_slow/Dockerfile b/tutorials/case_i/pymock_slow/Dockerfile index 46f7a43..dd07e26 100644 --- a/tutorials/case_i/pymock_slow/Dockerfile +++ b/tutorials/case_i/pymock_slow/Dockerfile @@ -28,7 +28,7 @@ RUN python3 -m venv $VIRTUAL_ENV && pip install --upgrade pip setuptools wheel # Copy the repository from the local machine to the Docker container. ## *Only the needed folders/files for the model build* COPY --chown=$USER_UID:$USER_GID pymock/ ./pymock/ -COPY --chown=$USER_UID:$USER_GID setup.cfg run.py setup.py ./ +COPY --chown=$USER_UID:$USER_GID setup.cfg setup.py ./ # Install the pymock package. ## *Uses pip to install setup.cfg and requirements/instructions therein*