hubmapconsortium · yuanzhou · Jan 27, 2025 · Jan 27, 2025 · Jan 27, 2025
diff --git a/src/app.py b/src/app.py
@@ -38,8 +38,6 @@
 from schema.schema_constants import TriggerTypeEnum
 from metadata_constraints import get_constraints, constraints_json_is_valid
 # from lib.ontology import initialize_ubkg, init_ontology, Ontology, UbkgSDK
-from dev_entity_worker import EntityWorker
-import dev_entity_exceptions as entityEx
 
 # HuBMAP commons
 from hubmap_commons import string_helper
@@ -248,23 +246,6 @@ def http_internal_server_error(e):
 except Exception as s3exception:
     logger.critical(s3exception, exc_info=True)
 
-####################################################################################################
-## Initialize a "worker" for the service.
-## For initial transition to "worker" usage, pass in globals of app.py which would eventually
-## be only in the worker and not in app.py.
-####################################################################################################
-entity_worker = None
-try:
-    entity_worker = EntityWorker(   app_config=app.config
-                                    , schema_mgr = schema_manager
-                                    , memcached_client_instance = memcached_client_instance
-                                    , neo4j_driver_instance = neo4j_driver_instance)
-    if not isinstance(entity_worker, EntityWorker):
-        raise Exception("Error instantiating a EntityWorker during startup.")
-    logger.info("EntityWorker instantiated using app.cfg setting.")
-except Exception as e:
-    logger.critical(f"Unable to instantiate a EntityWorker during startup.")
-    logger.error(e, exc_info=True)
 
 ####################################################################################################
 ## REFERENCE DOI Redirection
@@ -632,6 +613,112 @@ def _get_entity_visibility(normalized_entity_type, entity_dict):
         entity_visibility = DataVisibilityEnum.PUBLIC
     return entity_visibility
 
+'''
+Retrieve the metadata information for certain data associated with an entity.  This method supports
+Dataset entities, and can get the associated data for organs, samples, or donors.
+
+Get the associated data list based upon the user's authorization. The associated data may be
+filtered down if credentials were not presented for full access.
+
+Parameters
+----------
+dataset_dict : dict
+    A dictionary containing all the properties of the target entity.
+dataset_visibility : DataVisibilityEnum
+    An indication of whether the entity itself is public or not, so the associated data can
+    be filtered to match the entity dictionary before being returned.
+valid_user_token : str
+    Either the valid current token for an authenticated user or None.
+request : flask.Request (presumably)
+    The request passed to user_in_hubmap_read_group() to check HuBMAP-READ Globus Group membership.
+associated_data : str
+    A string indicating the associated property to be retrieved, which must be from
+    the values supported by this method.
+
+Returns
+-------
+list
+    The normalized associated entities, with excluded fields removed for unauthorized users; [] if none.
+'''
+def _get_dataset_associated_data(dataset_dict, dataset_visibility, valid_user_token, request, associated_data: str):
+
+    # Confirm the associated data requested is supported by this method (compared case-insensitively).
+    retrievable_associations = ['organs', 'samples', 'donors']
+    if associated_data.lower() not in retrievable_associations:
+        bad_request_error(  f"Dataset associated data cannot be retrieved for"
+                            f" {associated_data}, only"
+                            f" {COMMA_SEPARATOR.join(retrievable_associations)}.")
+
+    # Confirm the dictionary passed in is for a Dataset entity.
+    if not schema_manager.entity_type_instanceof(dataset_dict['entity_type'], 'Dataset'):
+        bad_request_error(  f"'{dataset_dict['entity_type']}' for"
+                            f" uuid={dataset_dict['uuid']} is not a Dataset or Publication,"
+                            f" so '{associated_data}' can not be retrieved for it.")
+    # Set up fields to be excluded when retrieving the entities associated with
+    # the Dataset.  Organs are one kind of Sample.
+    if associated_data.lower() in ['organs', 'samples']:
+        fields_to_exclude = schema_manager.get_fields_to_exclude('Sample')
+    elif associated_data.lower() in ['donors']:
+        fields_to_exclude = schema_manager.get_fields_to_exclude('Donor')
+    else:  # Defensive: not expected for a value which passed the supported-values check above.
+        logger.error(   f"Expected associated data type to be verified, but got"
+                        f" associated_data.lower()={associated_data.lower()}.")
+        internal_server_error(f"Unexpected error retrieving '{associated_data}' for a Dataset")
+
+    public_entity = (dataset_visibility is DataVisibilityEnum.PUBLIC)
+
+    # Set a variable reflecting the user's authorization by being in the HuBMAP-READ Globus Group
+    user_authorized = user_in_hubmap_read_group(request=request)
+
+    # For non-public documents, reject the request if the user is not authorized
+    if not public_entity:
+        if valid_user_token is None:
+            forbidden_error(    f"{dataset_dict['entity_type']} for"
+                                f" {dataset_dict['uuid']} is not"
+                                f" accessible without presenting a token.")
+        if not user_authorized:
+            forbidden_error(    f"The requested Dataset has non-public data."
+                                f"  A Globus token with access permission is required.")
+
+    # By now, either the entity is publicly accessible or the user has the correct access level
+    if associated_data.lower() == 'organs':
+        associated_entities = app_neo4j_queries.get_associated_organs_from_dataset(neo4j_driver_instance,
+                                                                                   dataset_dict['uuid'])
+    elif associated_data.lower() == 'samples':
+        associated_entities = app_neo4j_queries.get_associated_samples_from_dataset(neo4j_driver_instance,
+                                                                                    dataset_dict['uuid'])
+    elif associated_data.lower() == 'donors':
+        associated_entities = app_neo4j_queries.get_associated_donors_from_dataset(neo4j_driver_instance,
+                                                                                   dataset_dict['uuid'])
+    else:  # Defensive: not expected for a value which passed the supported-values check above.
+        logger.error(   f"Expected associated data type to be verified, but got"
+                        f" associated_data.lower()={associated_data.lower()} while retrieving from Neo4j.")
+        internal_server_error(f"Unexpected error retrieving '{associated_data}' from the data store")
+
+    # If there are zero items in the list of associated_entities, return an empty list rather than retrieving.
+    if len(associated_entities) < 1:
+        return []
+
+    # Use the internal token to retrieve the complete associated entities to assure they are returned.
+    # This way entities associated with a public Dataset can be accessed even if valid_user_token is None.
+    internal_token = auth_helper_instance.getProcessSecret()
+    complete_entities_list = schema_manager.get_complete_entities_list( token=internal_token
+                                                                        , entities_list=associated_entities)
+    # Final result after normalization
+    final_result = schema_manager.normalize_entities_list_for_response(entities_list=complete_entities_list)
+
+    # For public entities, limit the fields in the response unless the authorization presented in the
+    # Request allows the user to see all properties.
+    if public_entity and not user_authorized:
+        filtered_entities_list = []
+        for entity in final_result:
+            final_entity_dict = schema_manager.exclude_properties_from_response(excluded_fields=fields_to_exclude
+                                                                                , output_dict=entity)
+            filtered_entities_list.append(final_entity_dict)
+        final_result = filtered_entities_list
+
+    return final_result
+
 '''
 Retrieve the full provenance metadata information of a given entity by id, as
 produced for metadata.json files.
@@ -644,11 +731,11 @@ def _get_entity_visibility(normalized_entity_type, entity_dict):
 
 An HTTP 400 Response is returned for reasons described in the error message, such as
 requesting data for a non-Dataset.
- 
+
 An HTTP 401 Response is returned when a token is presented that is not valid.
 
 An HTTP 403 Response is returned if user is not authorized to access the Dataset, as described above.
-  
+
 An HTTP 404 Response is returned if the requested Dataset is not found.
 
 Parameters
@@ -661,39 +748,95 @@ def _get_entity_visibility(normalized_entity_type, entity_dict):
 json
     Valid JSON for the full provenance metadata of the requested Dataset
 '''
-@app.route('/datasets/<id>/prov-metadata', methods = ['GET'])
-def get_provenance_metadata_by_id_for_auth_level(id:Annotated[str, 32]) -> str:
+@app.route('/datasets/<id>/prov-metadata', methods=['GET'])
+def get_provenance_metadata_by_id_for_auth_level(id):
+    # A token is not required, but if an invalid token is provided,
+    # we need to tell the client with a 401 error
+    validate_token_if_auth_header_exists(request)
 
-    try:
-        # Get the user's token from the Request for later authorization to access non-public entities.
-        # If an invalid token is presented, reject with an HTTP 401 Response.
-        # N.B. None is a "valid" user_token which may be adequate for access to public data.
-        user_token = entity_worker.get_request_auth_token(request=request)
-
-        # Get the user's token from the Request for later authorization to access non-public entities.
-        user_info = entity_worker.get_request_user_info_with_groups(request=request)
-
-        # Retrieve the expanded metadata for the entity.  If authorization of token or group membership
-        # does not allow access to the entity, exceptions will be raised describing the problem.
-        expanded_entity_metadata = entity_worker.get_expanded_dataset_metadata( dataset_id=id
-                                                                                , valid_user_token=user_token
-                                                                                , user_info=user_info)
-        return jsonify(expanded_entity_metadata)
-    except entityEx.EntityBadRequestException as e_400:
-        return jsonify({'error': e_400.message}), 400
-    except entityEx.EntityUnauthorizedException as e_401:
-        return jsonify({'error': e_401.message}), 401
-    except entityEx.EntityForbiddenException as e_403:
-        return jsonify({'error': e_403.message}), 403
-    except entityEx.EntityNotFoundException as e_404:
-        return jsonify({'error': e_404.message}), 404
-    except entityEx.EntityServerErrorException as e_500:
-        logger.exception(f"An unexpected error occurred during provenance metadata retrieval.")
-        return jsonify({'error': e_500.message}), 500
-    except Exception as e:
-        default_msg = 'An unexpected error occurred retrieving provenance metadata'
-        logger.exception(default_msg)
-        return jsonify({'error': default_msg}), 500
+    # Use the internal token to query the target entity
+    # since public entities don't require user token
+    token = get_internal_token()
+
+    # The argument id that shadows Python's built-in id should be an identifier for a Dataset.
+    # Get the entity dict from cache if exists
+    # Otherwise query against uuid-api and neo4j to get the entity dict if the id exists
+    dataset_dict = query_target_entity(id, token)
+    normalized_entity_type = dataset_dict['entity_type']
+
+    # Validate the entity type: only Dataset types (Dataset, Publication) are supported
+    if not schema_manager.entity_type_instanceof(normalized_entity_type, 'Dataset'):
+        bad_request_error(f"Unable to get the provenance metatdata for this: {normalized_entity_type},"
+                          " supported entity types: Dataset, Publication")
+
+    # Get the generated complete entity result from cache if exists
+    # Otherwise re-generate on the fly
+    complete_dict = schema_manager.get_complete_entity_result(token=token
+                                                              , entity_dict=dataset_dict)
+
+    # Determine if the entity is publicly visible based on its data, only.
+    # To verify if a Collection is public, it is necessary to have its Datasets, which
+    # are populated as triggered data.  So pull back the complete entity for
+    # _get_entity_visibility() to check.
+    entity_scope = _get_entity_visibility(  normalized_entity_type=normalized_entity_type
+                                            ,entity_dict=complete_dict)
+    public_entity = (entity_scope is DataVisibilityEnum.PUBLIC)
+
+    # Set a variable reflecting the user's authorization by being in the HuBMAP-READ Globus Group
+    user_authorized = user_in_hubmap_read_group(request=request)
+
+    # Get user token from Authorization header
+    user_token = get_user_token(request)
+
+    # For non-public documents, reject the request if the user is not authorized
+    if not public_entity:
+        if user_token is None:
+            forbidden_error(    f"{normalized_entity_type} for {complete_dict['uuid']} is not"
+                                f" accessible without presenting a token.")
+        if not user_authorized:
+            forbidden_error(    f"The requested {normalized_entity_type} has non-public data."
+                                f"  A Globus token with access permission is required.")
+
+    # We'll need to return all the properties including those generated by
+    # `on_read_trigger` to have a complete result e.g., the 'next_revision_uuid' and
+    # 'previous_revision_uuid' being used below.
+    # Collections, however, will filter out only public properties for return.
+
+    # Also normalize the result based on schema
+    final_result = schema_manager.normalize_entity_result_for_response(complete_dict)
+
+    # Identify fields to exclude from non-authorized responses for the entity type.
+    fields_to_exclude = schema_manager.get_fields_to_exclude(normalized_entity_type)
+
+    # For public entities, strip the excluded fields unless the user is in the HuBMAP-READ group
+    if public_entity and not user_authorized:
+        final_result = schema_manager.exclude_properties_from_response(fields_to_exclude, final_result)
+
+    # Retrieve the associated data for the entity, and add it to the expanded dictionary.
+    associated_organ_list = _get_dataset_associated_data(   dataset_dict=final_result
+                                                            , dataset_visibility=entity_scope
+                                                            , valid_user_token=user_token
+                                                            , request=request
+                                                            , associated_data='Organs')
+    final_result['organs'] = associated_organ_list
+
+    associated_sample_list = _get_dataset_associated_data(   dataset_dict=final_result
+                                                            , dataset_visibility=entity_scope
+                                                            , valid_user_token=user_token
+                                                            , request=request
+                                                            , associated_data='Samples')
+    final_result['samples'] = associated_sample_list
+
+    associated_donor_list = _get_dataset_associated_data(   dataset_dict=final_result
+                                                            , dataset_visibility=entity_scope
+                                                            , valid_user_token=user_token
+                                                            , request=request
+                                                            , associated_data='Donors')
+
+    final_result['donors'] = associated_donor_list
+
+    # Return JSON for the dictionary containing the entity metadata as well as metadata for the associated data.
+    return jsonify(final_result)
 
 """
 Retrieve the metadata information of a given entity by id

diff --git a/src/dev_entity_exceptions.py b/src/dev_entity_exceptions.py