|
26 | 26 | aws_endpoint_url = "https://s3.amazonaws.com" |
27 | 27 | gcp_endpoint_url = "https://storage.googleapis.com" |
28 | 28 | asset_endpoint_url = f"https://github.com/ImagingDataCommons/idc-index-data/releases/download/{idc_index_data.__version__}" |
29 | | -github_api_url = f"https://api.github.com/repos/ImagingDataCommons/idc-index-data/releases/tags/{idc_index_data.__version__}" |
30 | 29 |
|
31 | 30 | logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO) |
32 | 31 | logger = logging.getLogger(__name__) |
@@ -161,9 +160,8 @@ def __init__(self): |
161 | 160 | def _discover_available_indices(self, refresh: bool = False) -> dict: |
162 | 161 | """Discover available index tables from the idc-index-data GitHub release assets. |
163 | 162 |
|
164 | | - This method attempts to discover available index parquet files by first trying |
165 | | - the GitHub releases API, and falling back to a known list of indices if the API |
166 | | - is unavailable. In either case, it populates descriptions from the accompanying |
| 163 | + This method builds direct download URLs for a known list of index parquet files |
| 164 | + in the GitHub release, without querying the GitHub releases API. It populates descriptions from the accompanying |
167 | 165 | JSON schema files. |
168 | 166 |
|
169 | 167 | Args: |
@@ -207,120 +205,40 @@ def _discover_available_indices(self, refresh: bool = False) -> dict: |
207 | 205 | "analysis_results_index", |
208 | 206 | ] |
209 | 207 |
|
210 | | - # Try to discover additional indices from the GitHub release API |
211 | | - discovered_from_api = False |
212 | | - try: |
213 | | - response = requests.get(github_api_url, timeout=30) |
214 | | - if response.status_code == 200: |
215 | | - release_data = response.json() |
216 | | - assets = release_data.get("assets", []) |
217 | | - |
218 | | - # Find all parquet files in the release assets |
219 | | - parquet_assets = { |
220 | | - a["name"]: a["browser_download_url"] |
221 | | - for a in assets |
222 | | - if a["name"].endswith(".parquet") |
223 | | - } |
224 | | - |
225 | | - # Find all json schema files in the release assets |
226 | | - json_assets = { |
227 | | - a["name"]: a["browser_download_url"] |
228 | | - for a in assets |
229 | | - if a["name"].endswith(".json") |
230 | | - } |
231 | | - |
232 | | - discovered_from_api = True |
233 | | - |
234 | | - # Update descriptions for bundled indices if available |
235 | | - for bundled_name, schema_filename in [ |
236 | | - ("index", "idc_index.json"), |
237 | | - ("prior_versions_index", "prior_versions_index.json"), |
238 | | - ]: |
239 | | - if schema_filename in json_assets: |
240 | | - schema = self._fetch_index_schema_from_url( |
241 | | - json_assets[schema_filename] |
242 | | - ) |
243 | | - if schema and "table_description" in schema: |
244 | | - indices[bundled_name]["description"] = schema[ |
245 | | - "table_description" |
246 | | - ] |
247 | | - |
248 | | - # Process discovered parquet files |
249 | | - for parquet_name, parquet_url in parquet_assets.items(): |
250 | | - # Extract index name from filename |
251 | | - index_name = parquet_name.replace(".parquet", "") |
252 | | - |
253 | | - # Skip bundled indices |
254 | | - if index_name in ("idc_index", "prior_versions_index"): |
255 | | - continue |
256 | | - |
257 | | - # Determine description from schema file |
258 | | - description = "" |
259 | | - schema_name = f"{index_name}.json" |
260 | | - if schema_name in json_assets: |
261 | | - schema = self._fetch_index_schema_from_url( |
262 | | - json_assets[schema_name] |
263 | | - ) |
264 | | - if schema: |
265 | | - description = schema.get("table_description", "") |
| 208 | + # Build entries for the known remote indices from direct download URLs (avoids GitHub API rate limits) |
| 209 | + for index_name in known_remote_indices: |
| 210 | + # Try to fetch the schema directly |
| 211 | + schema_url = f"{asset_endpoint_url}/{index_name}.json" |
| 212 | + parquet_url = f"{asset_endpoint_url}/{index_name}.parquet" |
266 | 213 |
|
267 | | - # Check if the index is already installed locally |
268 | | - local_path = os.path.join( |
269 | | - self.indices_data_dir, f"{index_name}.parquet" |
270 | | - ) |
271 | | - installed = os.path.exists(local_path) |
272 | | - file_path = local_path if installed else None |
273 | | - |
274 | | - indices[index_name] = { |
275 | | - "description": description, |
276 | | - "installed": installed, |
277 | | - "url": parquet_url, |
278 | | - "file_path": file_path, |
279 | | - } |
| 214 | + description = "" |
| 215 | + schema = self._fetch_index_schema_from_url(schema_url) |
| 216 | + if schema: |
| 217 | + description = schema.get("table_description", "") |
280 | 218 |
|
281 | | - else: |
282 | | - logger.debug( |
283 | | - f"GitHub API returned status {response.status_code}. " |
284 | | - "Using known index list with schema discovery." |
285 | | - ) |
286 | | - except requests.exceptions.RequestException as e: |
287 | | - logger.debug(f"GitHub API request failed: {e}. Using known index list.") |
288 | | - |
289 | | - # If API discovery failed, use known list and try to fetch schemas directly |
290 | | - if not discovered_from_api: |
291 | | - for index_name in known_remote_indices: |
292 | | - # Try to fetch the schema directly |
293 | | - schema_url = f"{asset_endpoint_url}/{index_name}.json" |
294 | | - parquet_url = f"{asset_endpoint_url}/{index_name}.parquet" |
295 | | - |
296 | | - description = "" |
297 | | - schema = self._fetch_index_schema_from_url(schema_url) |
298 | | - if schema: |
299 | | - description = schema.get("table_description", "") |
300 | | - |
301 | | - # Check if the index is already installed locally |
302 | | - local_path = os.path.join( |
303 | | - self.indices_data_dir, f"{index_name}.parquet" |
304 | | - ) |
305 | | - installed = os.path.exists(local_path) |
306 | | - file_path = local_path if installed else None |
307 | | - |
308 | | - indices[index_name] = { |
309 | | - "description": description, |
310 | | - "installed": installed, |
311 | | - "url": parquet_url, |
312 | | - "file_path": file_path, |
313 | | - } |
314 | | - |
315 | | - # Also try to update bundled index descriptions |
316 | | - for bundled_name, schema_filename in [ |
317 | | - ("index", "idc_index.json"), |
318 | | - ("prior_versions_index", "prior_versions_index.json"), |
319 | | - ]: |
320 | | - schema_url = f"{asset_endpoint_url}/{schema_filename}" |
321 | | - schema = self._fetch_index_schema_from_url(schema_url) |
322 | | - if schema and "table_description" in schema: |
323 | | - indices[bundled_name]["description"] = schema["table_description"] |
| 219 | + # Check if the index is already installed locally |
| 220 | + local_path = os.path.join( |
| 221 | + self.indices_data_dir, f"{index_name}.parquet" |
| 222 | + ) |
| 223 | + installed = os.path.exists(local_path) |
| 224 | + file_path = local_path if installed else None |
| 225 | + |
| 226 | + indices[index_name] = { |
| 227 | + "description": description, |
| 228 | + "installed": installed, |
| 229 | + "url": parquet_url, |
| 230 | + "file_path": file_path, |
| 231 | + } |
| 232 | + |
| 233 | + # Also try to update bundled index descriptions from schema files |
| 234 | + for bundled_name, schema_filename in [ |
| 235 | + ("index", "idc_index.json"), |
| 236 | + ("prior_versions_index", "prior_versions_index.json"), |
| 237 | + ]: |
| 238 | + schema_url = f"{asset_endpoint_url}/{schema_filename}" |
| 239 | + schema = self._fetch_index_schema_from_url(schema_url) |
| 240 | + if schema and "table_description" in schema: |
| 241 | + indices[bundled_name]["description"] = schema["table_description"] |
324 | 242 |
|
325 | 243 | return indices |
326 | 244 |
|
|
0 commit comments