From 566a067f17d5cc393b0dec2eedadce27055083fd Mon Sep 17 00:00:00 2001 From: Michael Smit Date: Thu, 15 May 2025 15:05:54 -0700 Subject: [PATCH] download_file_from_gcs now uses cache. Related to PolicyEngine/issues#350 this change re-implements the download_file_from_gcs function to use CachingGoogleStorageClient. Files should now be cached locally on a per-process basis and only updated on disk when the remote version crc changes. --- changelog_entry.yaml | 4 ++ policyengine/utils/google_cloud_bucket.py | 38 +++++++++++-------- tests/utils/data/test_google_cloud_bucket.py | 39 ++++++++++++++++++++ 3 files changed, 66 insertions(+), 15 deletions(-) create mode 100644 tests/utils/data/test_google_cloud_bucket.py diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..d13c4b0e 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: patch + changes: + fixed: + - downloads from google storage should now be properly cached. diff --git a/policyengine/utils/google_cloud_bucket.py b/policyengine/utils/google_cloud_bucket.py index 14dc9031..d65e872e 100644 --- a/policyengine/utils/google_cloud_bucket.py +++ b/policyengine/utils/google_cloud_bucket.py @@ -1,3 +1,23 @@ +from .data.caching_google_storage_client import CachingGoogleStorageClient +import asyncio +from pathlib import Path + +_caching_client: CachingGoogleStorageClient | None = None + + +def _clear_client(): + global _caching_client + _caching_client = None + + +def _get_client(): + global _caching_client + if _caching_client is not None: + return _caching_client + _caching_client = CachingGoogleStorageClient() + return _caching_client + + def download_file_from_gcs( bucket_name: str, file_name: str, destination_path: str ) -> None: @@ -12,18 +32,6 @@ def download_file_from_gcs( Returns: None """ - from google.cloud import storage - - # Initialize a client - client = storage.Client() - - # Get the bucket - bucket = client.bucket(bucket_name) - - # Create a blob object from the file name - blob = bucket.blob(file_name) - - # Download the file to a local path - blob.download_to_filename(destination_path) - - return destination_path + asyncio.run( + _get_client().download(bucket_name, file_name, Path(destination_path)) + ) diff --git a/tests/utils/data/test_google_cloud_bucket.py b/tests/utils/data/test_google_cloud_bucket.py new file mode 100644 index 00000000..c141c0f9 --- /dev/null +++ b/tests/utils/data/test_google_cloud_bucket.py @@ -0,0 +1,39 @@ +from unittest import TestCase +from unittest.mock import patch +import pytest +from pathlib import Path +from policyengine.utils.google_cloud_bucket import ( + download_file_from_gcs, + _clear_client, +) + + +class TestGoogleCloudBucket(TestCase): + def setUp(self): + _clear_client() + + @patch( + "policyengine.utils.google_cloud_bucket.CachingGoogleStorageClient", + autospec=True, + ) + def test_download_uses_storage_client(self, client_class): + client_instance = client_class.return_value + download_file_from_gcs( + "TEST_BUCKET", "TEST/FILE/NAME.TXT", "TARGET/PATH" + ) + client_instance.download.assert_called_with( + "TEST_BUCKET", "TEST/FILE/NAME.TXT", Path("TARGET/PATH") + ) + + @patch( + "policyengine.utils.google_cloud_bucket.CachingGoogleStorageClient", + autospec=True, + ) + def test_download_only_creates_client_once(self, client_class): + download_file_from_gcs( + "TEST_BUCKET", "TEST/FILE/NAME.TXT", "TARGET/PATH" + ) + download_file_from_gcs( + "TEST_BUCKET", "TEST/FILE/NAME.TXT", "ANOTHER/PATH" + ) + client_class.assert_called_once()