From 260cbc943571d9f2955edf616bd57d46e689512a Mon Sep 17 00:00:00 2001 From: DerekFurstPitt Date: Tue, 24 Feb 2026 14:18:44 -0500 Subject: [PATCH 1/3] reworked get_accessibility_data_dict and its accompanying helper function in dataset_helper_object.py to accommodate published protected datasets and return a url to their public version. Also reworked everything to use a single neo4j query to improve performance --- src/app.py | 30 ++--- src/dataset_helper_object.py | 240 +++++++++++++++++++---------------- 2 files changed, 142 insertions(+), 128 deletions(-) diff --git a/src/app.py b/src/app.py index 91f041ba..ed86d2cb 100644 --- a/src/app.py +++ b/src/app.py @@ -20,7 +20,7 @@ import werkzeug.exceptions from hubmap_sdk import EntitySdk, sdk_helper from apscheduler.schedulers.background import BackgroundScheduler -from neo4j.exceptions import TransactionError +from neo4j.exceptions import TransactionError, Neo4jError from apscheduler.triggers.interval import IntervalTrigger from apscheduler.triggers.date import DateTrigger # Don't confuse urllib (Python native library) with urllib3 (3rd-party library, requests also uses urllib3) @@ -613,22 +613,18 @@ def get_accessible_data_directories(): if not isinstance(identifier, str): bad_request_error('The Request payload JSON Array must contain only identifier strings.') - payload_accessibility_dict = {} - for identifier in json_payload: - try: - identifier_accessibility_dict = dataset_helper.get_entity_accessibility(identifier - , user_token - , user_data_access_level=user_data_access_level) - payload_accessibility_dict[identifier] = identifier_accessibility_dict - except (HTTPException, sdk_helper.HTTPException) as he: - return jsonify({'error': he.get_description()}), he.get_status_code() - except ValueError as ve: - logger.error(str(ve)) - return jsonify({'error': str(ve)}), 400 - except Exception as e: - logger.error(e, exc_info=True) - return Response("Unexpected error: " + str(e), 500) - return 
jsonify(payload_accessibility_dict), 200 + try: + identifier_accessibility_dict = dataset_helper.get_entity_accessibility(neo4j_driver_instance, json_payload, user_data_access_level=user_data_access_level) + except Neo4jError as ne: + logger.error(str(ne.message)) + return jsonify({'Unexpected error': 'Failed to retrieve accessibility info from Neo4j. Check the logs'}), 500 + except ValueError as ve: + logger.error(str(ve)) + return jsonify({'error': str(ve)}), 400 + except Exception as e: + logger.error(e, exc_info=True) + return Response("Unexpected error: " + str(e), 500) + return jsonify(identifier_accessibility_dict), 200 """ Retrieve the path of Datasets or Uploads relative to the Globus endpoint mount point give from a list of entity uuids diff --git a/src/dataset_helper_object.py b/src/dataset_helper_object.py index 89d3ad19..e05e33a8 100644 --- a/src/dataset_helper_object.py +++ b/src/dataset_helper_object.py @@ -5,6 +5,7 @@ import requests import logging from flask import Flask +from neo4j.exceptions import Neo4jError from hubmap_commons.hubmap_const import HubmapConst from hubmap_sdk import EntitySdk, SearchSdk, sdk_helper from pandas.core.array_algos.take import take_nd @@ -128,121 +129,138 @@ def verify_dataset_title_info(self, dataset_uuid: str, user_token: str) -> array # # Returns a JSON Object containing accessibility information for the entity. # - def get_entity_accessibility(self, entity_id: str, user_token: str, user_data_access_level: dict = None) -> dict: - entity_api = EntitySdk(token=user_token, service_url=_entity_api_url) + def get_entity_accessibility(self, neo4j_driver, json_payload, user_data_access_level: dict = None) -> dict: supported_entity_type_list = ['Dataset', 'Upload'] + accessibility_dicts = {} - # Grab the entity from the entity-api service. 
- try: - sdk_entity = entity_api.get_entity_by_id(entity_id) - except sdk_helper.HTTPException as he: - # Determine if this entity_id should be shown as inaccessible in an - # HTTP 200 Response. Otherwise, let the HTTPException be processed - if he.status_code == 404: - # We will log when the user is checking on entities which are inaccessible. - logger.debug(f"User accessibility retrieval of non-valid {entity_id}" - f" resulted in {he.status_code} exception he={str(he)}") - # Create a simple dict when entity_id is not for an existing entity - return {'valid_id': False} - elif he.status_code == 403: - # We will log when the user is checking on entities which are inaccessible. - logger.debug(f"User accessibility retrieval of valid, inaccessible {entity_id}" - f" resulted in {he.status_code} exception he={str(he)}") - # Create a simple dict when entity_id is not for an existing entity - return {'valid_id': True - , 'access_allowed': False} + query = ( + "MATCH (e:Entity) " + "WHERE e.uuid IN $ids OR e.hubmap_id IN $ids " + "RETURN COLLECT({" + "uuid: e.uuid, " + "hubmap_id: e.hubmap_id, " + "entity_type: e.entity_type, " + "status: e.status, " + "group_name: e.group_name, " + "group_uuid: e.group_uuid, " + "contains_human_genetic_sequences: e.contains_human_genetic_sequences, " + "data_access_level: e.data_access_level" + "}) AS entities" + ) + + with neo4j_driver.session() as session: + result = session.run(query, ids=json_payload) + record = result.single() + entities = record["entities"] if record else [] + + requested_ids = set(json_payload) + matched_ids = set() + for e in entities: + if e.get("uuid"): + matched_ids.add(e["uuid"]) + if e.get("hubmap_id"): + matched_ids.add(e["hubmap_id"]) + if e.get("uuid") in requested_ids: + e['original_id'] = e.get("uuid") else: - raise he - except Exception as e: - msg = f"Unable to get data to determine accessibility of '{entity_id}'" - logger.exception(msg) - raise Exception(msg) - - entity_dict = vars(sdk_entity) - if 
entity_dict['entity_type'] not in supported_entity_type_list: - return {'valid_id': False} - - # Make sure all expected elements for the business requirements are in the returned entity. - # Need to determine entity "visibility" using the same rules found in the - - missing_entity_elements = [] - if 'entity_type' not in entity_dict: - missing_entity_elements.append('entity_type') - if 'uuid' not in entity_dict: - missing_entity_elements.append('uuid') - if 'hubmap_id' not in entity_dict: - missing_entity_elements.append('hubmap_id') - if 'status' not in entity_dict: - missing_entity_elements.append('status') - if 'group_name' not in entity_dict: - missing_entity_elements.append('group_name') - if 'group_uuid' not in entity_dict: - missing_entity_elements.append('group_uuid') - if 'contains_human_genetic_sequences' not in entity_dict and \ - entity_dict['entity_type'] == 'Dataset': - missing_entity_elements.append('contains_human_genetic_sequences') - if 'data_access_level' not in entity_dict and \ - entity_dict['entity_type'] == 'Dataset': - missing_entity_elements.append('data_access_level') - if missing_entity_elements: - logger.error(f"Unexpected format for '{entity_id}'" - f" , missing {str(missing_entity_elements)}" - f" from entity={str(entity_dict)}.") - raise Exception(f"Data error determining accessibility of '{entity_id}'") - - if entity_dict['entity_type'] == 'Dataset': - user_access_allowed = (entity_dict['data_access_level'] == HubmapConst.ACCESS_LEVEL_PUBLIC) - if not user_access_allowed: - user_access_allowed = (entity_dict['data_access_level'] == HubmapConst.ACCESS_LEVEL_CONSORTIUM) and \ - user_data_access_level['data_access_level'] in [ - HubmapConst.ACCESS_LEVEL_CONSORTIUM \ - , HubmapConst.ACCESS_LEVEL_PROTECTED] - if not user_access_allowed: - user_access_allowed = (entity_dict['data_access_level'] == HubmapConst.ACCESS_LEVEL_PROTECTED) and \ - (user_data_access_level['data_access_level'] in [ - HubmapConst.ACCESS_LEVEL_PROTECTED] \ - or 
entity_dict['group_uuid'] in user_data_access_level['group_membership_ids']) - - if entity_dict['data_access_level'] == HubmapConst.ACCESS_LEVEL_PROTECTED: + e['original_id'] = e.get("hubmap_id") + + invalid_ids = list(requested_ids - matched_ids) + + for invalid in invalid_ids: + accessibility_dicts[invalid] = {'valid_id': False} + for entity_dict in entities: + + if entity_dict['entity_type'] not in supported_entity_type_list: + accessibility_dicts[entity_dict['original_id']] = {'valid_id': False} + + # Make sure all expected elements for the business requirements are in the returned entity. + # Need to determine entity "visibility" using the same rules found in the + + missing_entity_elements = [] + if 'entity_type' not in entity_dict: + missing_entity_elements.append('entity_type') + if 'uuid' not in entity_dict: + missing_entity_elements.append('uuid') + if 'hubmap_id' not in entity_dict: + missing_entity_elements.append('hubmap_id') + if 'status' not in entity_dict: + missing_entity_elements.append('status') + if 'group_name' not in entity_dict: + missing_entity_elements.append('group_name') + if 'group_uuid' not in entity_dict: + missing_entity_elements.append('group_uuid') + if 'contains_human_genetic_sequences' not in entity_dict and \ + entity_dict['entity_type'] == 'Dataset': + missing_entity_elements.append('contains_human_genetic_sequences') + if 'data_access_level' not in entity_dict and \ + entity_dict['entity_type'] == 'Dataset': + missing_entity_elements.append('data_access_level') + + if missing_entity_elements: + logger.error(f"Unexpected format for '{entity_dict['original_id']}'" + f" , missing {str(missing_entity_elements)}" + f" from entity={str(entity_dict)}.") + raise Exception(f"Data error determining accessibility of '{entity_dict['original_id']}'") + + if entity_dict['entity_type'] == 'Dataset': + user_access_allowed = (entity_dict['data_access_level'] == HubmapConst.ACCESS_LEVEL_PUBLIC) + if not user_access_allowed: + user_access_allowed 
= (entity_dict['data_access_level'] == HubmapConst.ACCESS_LEVEL_CONSORTIUM) and \ + user_data_access_level['data_access_level'] in [ + HubmapConst.ACCESS_LEVEL_CONSORTIUM \ + , HubmapConst.ACCESS_LEVEL_PROTECTED] + if not user_access_allowed: + user_access_allowed = (entity_dict['data_access_level'] == HubmapConst.ACCESS_LEVEL_PROTECTED) and \ + (user_data_access_level['data_access_level'] in [ + HubmapConst.ACCESS_LEVEL_PROTECTED] \ + or entity_dict['group_uuid'] in user_data_access_level['group_membership_ids']) + + if (entity_dict['data_access_level'] == HubmapConst.ACCESS_LEVEL_PROTECTED) and not user_access_allowed and entity_dict.get('status').lower() == 'published': + abs_path = os.path.join(_globus_public_endpoint_filepath + , entity_dict['uuid']) + elif (entity_dict['data_access_level'] == HubmapConst.ACCESS_LEVEL_PROTECTED): + abs_path = os.path.join(_globus_protected_endpoint_filepath + , entity_dict['group_name'] + , entity_dict['uuid']) + elif entity_dict['data_access_level'] == HubmapConst.ACCESS_LEVEL_CONSORTIUM: + abs_path = os.path.join(_globus_consortium_endpoint_filepath + , entity_dict['group_name'] + , entity_dict['uuid']) + elif entity_dict['data_access_level'] == HubmapConst.ACCESS_LEVEL_PUBLIC: + abs_path = os.path.join(_globus_public_endpoint_filepath + , entity_dict['uuid']) + else: + raise Exception(f"Unexpected error for {entity_dict['original_id']} of type" + f" {entity_dict['entity_type']} with data access level" + f" {entity_dict['data_access_level']}.") + + entity_accessibility_dict = {'valid_id': True, 'access_allowed': user_access_allowed} + if entity_dict.get('status').lower() == 'published': + entity_accessibility_dict['access_allowed'] = True + if user_access_allowed or entity_dict.get('status').lower() == 'published': + entity_accessibility_dict['hubmap_id'] = entity_dict['hubmap_id'] + entity_accessibility_dict['uuid'] = entity_dict['uuid'] + entity_accessibility_dict['entity_type'] = entity_dict['entity_type'] + 
entity_accessibility_dict['file_system_path'] = abs_path + accessibility_dicts[entity_dict['original_id']] = entity_accessibility_dict + elif entity_dict['entity_type'] == 'Upload': + user_access_allowed = (user_data_access_level['data_access_level'] in [ + HubmapConst.ACCESS_LEVEL_PROTECTED] + or entity_dict['group_uuid'] in user_data_access_level['group_membership_ids']) abs_path = os.path.join(_globus_protected_endpoint_filepath , entity_dict['group_name'] , entity_dict['uuid']) - elif entity_dict['data_access_level'] == HubmapConst.ACCESS_LEVEL_CONSORTIUM: - abs_path = os.path.join(_globus_consortium_endpoint_filepath - , entity_dict['group_name'] - , entity_dict['uuid']) - elif entity_dict['data_access_level'] == HubmapConst.ACCESS_LEVEL_PUBLIC: - abs_path = os.path.join(_globus_public_endpoint_filepath - , entity_dict['uuid']) + + entity_accessibility_dict = { 'valid_id': True + , 'access_allowed': user_access_allowed} + if user_access_allowed: + entity_accessibility_dict['hubmap_id'] = entity_dict['hubmap_id'] + entity_accessibility_dict['uuid'] = entity_dict['uuid'] + entity_accessibility_dict['entity_type'] = entity_dict['entity_type'] + entity_accessibility_dict['file_system_path'] = abs_path + accessibility_dicts[entity_dict['original_id']] = entity_accessibility_dict else: - raise Exception(f"Unexpected error for {entity_id} of type" - f" {entity_dict['entity_type']} with data access level" - f" {entity_dict['data_access_level']}.") - - entity_accessibility_dict = {'valid_id': True - , 'access_allowed': user_access_allowed} - if user_access_allowed: - entity_accessibility_dict['hubmap_id'] = entity_dict['hubmap_id'] - entity_accessibility_dict['uuid'] = entity_dict['uuid'] - entity_accessibility_dict['entity_type'] = entity_dict['entity_type'] - entity_accessibility_dict['file_system_path'] = abs_path - return entity_accessibility_dict - elif entity_dict['entity_type'] == 'Upload': - user_access_allowed = (user_data_access_level['data_access_level'] in 
[ - HubmapConst.ACCESS_LEVEL_PROTECTED] - or entity_dict['group_uuid'] in user_data_access_level['group_membership_ids']) - abs_path = os.path.join(_globus_protected_endpoint_filepath - , entity_dict['group_name'] - , entity_dict['uuid']) - - entity_accessibility_dict = { 'valid_id': True - , 'access_allowed': user_access_allowed} - if user_access_allowed: - entity_accessibility_dict['hubmap_id'] = entity_dict['hubmap_id'] - entity_accessibility_dict['uuid'] = entity_dict['uuid'] - entity_accessibility_dict['entity_type'] = entity_dict['entity_type'] - entity_accessibility_dict['file_system_path'] = abs_path - return entity_accessibility_dict - else: - raise Exception(f"Unexpected error for {entity_id} of type" - f" {entity_dict['entity_type']}.") + raise Exception(f"Unexpected error for {entity_dict['original_id']} of type" + f" {entity_dict['entity_type']}.") + return accessibility_dicts From 11585de79cb3b33eaa37adc1d87bf2d59d5311f9 Mon Sep 17 00:00:00 2001 From: DerekFurstPitt Date: Wed, 25 Feb 2026 15:56:50 -0500 Subject: [PATCH 2/3] updated comment block for get_entity_accessibility to reflect changes --- src/dataset_helper_object.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/dataset_helper_object.py b/src/dataset_helper_object.py index e05e33a8..78c0e622 100644 --- a/src/dataset_helper_object.py +++ b/src/dataset_helper_object.py @@ -121,14 +121,14 @@ def verify_dataset_title_info(self, dataset_uuid: str, user_token: str) -> array return rslt - # entity_id - UUID or HM_ID - # user_token - The authorization token for the user, which is used to generate an appropriate - # description of the user's access to the entity. + # neo4j_driver - The driver instance for neo4j + # json_payload - A list of ids (HM_ID or UUID) # user_data_access_level - Data access level information for the user, notably including # Globus Group membership information. # - # Returns a JSON Object containing accessibility information for the entity. 
- # + # Returns a Dict of Dicts where each of the dicts inside is keyed by its original id given + # in the json_payload and contains information about the accessibility of that directory + # including its globus url. def get_entity_accessibility(self, neo4j_driver, json_payload, user_data_access_level: dict = None) -> dict: supported_entity_type_list = ['Dataset', 'Upload'] accessibility_dicts = {} From ef8440b167280a0659cd9399ecb446e235e76682 Mon Sep 17 00:00:00 2001 From: DerekFurstPitt Date: Fri, 27 Feb 2026 14:15:58 -0500 Subject: [PATCH 3/3] removed the requirement to have a token to access this endpoint /entities/accessible-data-directories --- src/app.py | 10 +--------- src/dataset_helper_object.py | 2 +- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/src/app.py b/src/app.py index ed86d2cb..cb7d8fc7 100644 --- a/src/app.py +++ b/src/app.py @@ -588,22 +588,14 @@ class ReindexPriorityLevelEnum(Enum): def get_accessible_data_directories(): dataset_helper = DatasetHelper() - # If not token is provided or an invalid token is provided, return a 401 error. - if request.headers.get('Authorization') is None: - unauthorized_error('A valid token must be provided.') - # If an invalid token provided, we need to tell the client with a 401 error, rather # than a 500 error later if the token is not good. _validate_token_if_auth_header_exists(request) - # Get user token from Authorization header - # Get the user token from Authorization header - user_token = auth_helper_instance.getAuthorizationTokens(request.headers) - # Get user group information which will be used to determine accessibility on # a per-entity basis. 
user_data_access_level = auth_helper_instance.getUserDataAccessLevel(request) - + user_data_access_level['group_membership_ids'] = [] if not request.is_json: bad_request_error("A json body and appropriate Content-Type header are required.") json_payload = request.get_json() diff --git a/src/dataset_helper_object.py b/src/dataset_helper_object.py index 78c0e622..cbe59b69 100644 --- a/src/dataset_helper_object.py +++ b/src/dataset_helper_object.py @@ -129,7 +129,7 @@ def verify_dataset_title_info(self, dataset_uuid: str, user_token: str) -> array # Returns a Dict of Dicts where each of the dicts inside is keyed by its original id given # in the json_payload and contains information about the accessibility of that directory # including its globus url. - def get_entity_accessibility(self, neo4j_driver, json_payload, user_data_access_level: dict = None) -> dict: + def get_entity_accessibility(self, neo4j_driver, json_payload, user_data_access_level) -> dict: supported_entity_type_list = ['Dataset', 'Upload'] accessibility_dicts = {}