Skip to content

Commit 9fb9a62

Browse files
Copilot and fedorov committed
Switch from GitHub API to direct download endpoints to avoid rate limits
Co-authored-by: fedorov <313942+fedorov@users.noreply.github.com>
1 parent de287b9 commit 9fb9a62

File tree

1 file changed

+34
-116
lines changed

1 file changed

+34
-116
lines changed

idc_index/index.py

Lines changed: 34 additions & 116 deletions
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,6 @@
2626
aws_endpoint_url = "https://s3.amazonaws.com"
2727
gcp_endpoint_url = "https://storage.googleapis.com"
2828
asset_endpoint_url = f"https://github.com/ImagingDataCommons/idc-index-data/releases/download/{idc_index_data.__version__}"
29-
github_api_url = f"https://api.github.com/repos/ImagingDataCommons/idc-index-data/releases/tags/{idc_index_data.__version__}"
3029

3130
logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)
3231
logger = logging.getLogger(__name__)
@@ -161,9 +160,8 @@ def __init__(self):
161160
def _discover_available_indices(self, refresh: bool = False) -> dict:
162161
"""Discover available index tables from the idc-index-data GitHub release assets.
163162
164-
This method attempts to discover available index parquet files by first trying
165-
the GitHub releases API, and falling back to a known list of indices if the API
166-
is unavailable. In either case, it populates descriptions from the accompanying
163+
This method discovers available index parquet files using direct download URLs
164+
from the GitHub releases. It populates descriptions from the accompanying
167165
JSON schema files.
168166
169167
Args:
@@ -207,120 +205,40 @@ def _discover_available_indices(self, refresh: bool = False) -> dict:
207205
"analysis_results_index",
208206
]
209207

210-
# Try to discover additional indices from the GitHub release API
211-
discovered_from_api = False
212-
try:
213-
response = requests.get(github_api_url, timeout=30)
214-
if response.status_code == 200:
215-
release_data = response.json()
216-
assets = release_data.get("assets", [])
217-
218-
# Find all parquet files in the release assets
219-
parquet_assets = {
220-
a["name"]: a["browser_download_url"]
221-
for a in assets
222-
if a["name"].endswith(".parquet")
223-
}
224-
225-
# Find all json schema files in the release assets
226-
json_assets = {
227-
a["name"]: a["browser_download_url"]
228-
for a in assets
229-
if a["name"].endswith(".json")
230-
}
231-
232-
discovered_from_api = True
233-
234-
# Update descriptions for bundled indices if available
235-
for bundled_name, schema_filename in [
236-
("index", "idc_index.json"),
237-
("prior_versions_index", "prior_versions_index.json"),
238-
]:
239-
if schema_filename in json_assets:
240-
schema = self._fetch_index_schema_from_url(
241-
json_assets[schema_filename]
242-
)
243-
if schema and "table_description" in schema:
244-
indices[bundled_name]["description"] = schema[
245-
"table_description"
246-
]
247-
248-
# Process discovered parquet files
249-
for parquet_name, parquet_url in parquet_assets.items():
250-
# Extract index name from filename
251-
index_name = parquet_name.replace(".parquet", "")
252-
253-
# Skip bundled indices
254-
if index_name in ("idc_index", "prior_versions_index"):
255-
continue
256-
257-
# Determine description from schema file
258-
description = ""
259-
schema_name = f"{index_name}.json"
260-
if schema_name in json_assets:
261-
schema = self._fetch_index_schema_from_url(
262-
json_assets[schema_name]
263-
)
264-
if schema:
265-
description = schema.get("table_description", "")
208+
# Discover indices using direct download URLs (no API rate limits)
209+
for index_name in known_remote_indices:
210+
# Try to fetch the schema directly
211+
schema_url = f"{asset_endpoint_url}/{index_name}.json"
212+
parquet_url = f"{asset_endpoint_url}/{index_name}.parquet"
266213

267-
# Check if the index is already installed locally
268-
local_path = os.path.join(
269-
self.indices_data_dir, f"{index_name}.parquet"
270-
)
271-
installed = os.path.exists(local_path)
272-
file_path = local_path if installed else None
273-
274-
indices[index_name] = {
275-
"description": description,
276-
"installed": installed,
277-
"url": parquet_url,
278-
"file_path": file_path,
279-
}
214+
description = ""
215+
schema = self._fetch_index_schema_from_url(schema_url)
216+
if schema:
217+
description = schema.get("table_description", "")
280218

281-
else:
282-
logger.debug(
283-
f"GitHub API returned status {response.status_code}. "
284-
"Using known index list with schema discovery."
285-
)
286-
except requests.exceptions.RequestException as e:
287-
logger.debug(f"GitHub API request failed: {e}. Using known index list.")
288-
289-
# If API discovery failed, use known list and try to fetch schemas directly
290-
if not discovered_from_api:
291-
for index_name in known_remote_indices:
292-
# Try to fetch the schema directly
293-
schema_url = f"{asset_endpoint_url}/{index_name}.json"
294-
parquet_url = f"{asset_endpoint_url}/{index_name}.parquet"
295-
296-
description = ""
297-
schema = self._fetch_index_schema_from_url(schema_url)
298-
if schema:
299-
description = schema.get("table_description", "")
300-
301-
# Check if the index is already installed locally
302-
local_path = os.path.join(
303-
self.indices_data_dir, f"{index_name}.parquet"
304-
)
305-
installed = os.path.exists(local_path)
306-
file_path = local_path if installed else None
307-
308-
indices[index_name] = {
309-
"description": description,
310-
"installed": installed,
311-
"url": parquet_url,
312-
"file_path": file_path,
313-
}
314-
315-
# Also try to update bundled index descriptions
316-
for bundled_name, schema_filename in [
317-
("index", "idc_index.json"),
318-
("prior_versions_index", "prior_versions_index.json"),
319-
]:
320-
schema_url = f"{asset_endpoint_url}/{schema_filename}"
321-
schema = self._fetch_index_schema_from_url(schema_url)
322-
if schema and "table_description" in schema:
323-
indices[bundled_name]["description"] = schema["table_description"]
219+
# Check if the index is already installed locally
220+
local_path = os.path.join(
221+
self.indices_data_dir, f"{index_name}.parquet"
222+
)
223+
installed = os.path.exists(local_path)
224+
file_path = local_path if installed else None
225+
226+
indices[index_name] = {
227+
"description": description,
228+
"installed": installed,
229+
"url": parquet_url,
230+
"file_path": file_path,
231+
}
232+
233+
# Also try to update bundled index descriptions from schema files
234+
for bundled_name, schema_filename in [
235+
("index", "idc_index.json"),
236+
("prior_versions_index", "prior_versions_index.json"),
237+
]:
238+
schema_url = f"{asset_endpoint_url}/{schema_filename}"
239+
schema = self._fetch_index_schema_from_url(schema_url)
240+
if schema and "table_description" in schema:
241+
indices[bundled_name]["description"] = schema["table_description"]
324242

325243
return indices
326244

0 commit comments

Comments
 (0)