From 6a0d18681706dd57f226e7fc3f3630d9a496d412 Mon Sep 17 00:00:00 2001
From: kburke <209327+kburke@users.noreply.github.com>
Date: Mon, 27 Jan 2025 11:41:57 -0500
Subject: [PATCH 1/2] Align prov-metadata endpoint with existing API structure

---
 src/app.py                   | 132 +++++---
 src/dev_entity_exceptions.py |  44 ---
 src/dev_entity_worker.py     | 615 -----------------------------------
 3 files changed, 79 insertions(+), 712 deletions(-)
 delete mode 100644 src/dev_entity_exceptions.py
 delete mode 100644 src/dev_entity_worker.py
diff --git a/src/app.py b/src/app.py
index dc2fc7c5..bc87fe63 100644
--- a/src/app.py
+++ b/src/app.py
@@ -38,8 +38,6 @@
 from schema.schema_constants import TriggerTypeEnum
 from metadata_constraints import get_constraints, constraints_json_is_valid
 # from lib.ontology import initialize_ubkg, init_ontology, Ontology, UbkgSDK
-from dev_entity_worker import EntityWorker
-import dev_entity_exceptions as entityEx
 
 # HuBMAP commons
 from hubmap_commons import string_helper
@@ -248,23 +246,6 @@ def http_internal_server_error(e):
 except Exception as s3exception:
     logger.critical(s3exception, exc_info=True)
 
-####################################################################################################
-## Initialize a "worker" for the service.
-## For initial transition to "worker" usage, pass in globals of app.py which would eventually
-## be only in the worker and not in app.py.
-####################################################################################################
-entity_worker = None
-try:
-    entity_worker = EntityWorker(   app_config=app.config
-                                    , schema_mgr = schema_manager
-                                    , memcached_client_instance = memcached_client_instance
-                                    , neo4j_driver_instance = neo4j_driver_instance)
-    if not isinstance(entity_worker, EntityWorker):
-        raise Exception("Error instantiating a EntityWorker during startup.")
-    logger.info("EntityWorker instantiated using app.cfg setting.")
-except Exception as e:
-    logger.critical(f"Unable to instantiate a EntityWorker during startup.")
-    logger.error(e, exc_info=True)
 
 ####################################################################################################
 ## REFERENCE DOI Redirection
@@ -632,6 +613,7 @@ def _get_entity_visibility(normalized_entity_type, entity_dict):
         entity_visibility = DataVisibilityEnum.PUBLIC
     return entity_visibility
 
+
 '''
 Retrieve the full provenance metadata information of a given entity by id, as
 produced for metadata.json files.
@@ -644,11 +626,11 @@ def _get_entity_visibility(normalized_entity_type, entity_dict):
 
 An HTTP 400 Response is returned for reasons described in the error message, such as
 requesting data for a non-Dataset.
- 
+
 An HTTP 401 Response is returned when a token is presented that is not valid.
 
 An HTTP 403 Response is returned if user is not authorized to access the Dataset, as described above.
-  
+
 An HTTP 404 Response is returned if the requested Dataset is not found.
 
 Parameters
@@ -661,39 +643,83 @@ def _get_entity_visibility(normalized_entity_type, entity_dict):
 json
     Valid JSON for the full provenance metadata of the requested Dataset
 '''
-@app.route('/datasets/<id>/prov-metadata', methods = ['GET'])
-def get_provenance_metadata_by_id_for_auth_level(id:Annotated[str, 32]) -> str:
+@app.route('/datasets/<id>/prov-metadata', methods=['GET'])
+def get_provenance_metadata_by_id_for_auth_level(id):
+    # A token is not required, but if an invalid token is provided,
+    # we need to tell the client with a 401 error
+    validate_token_if_auth_header_exists(request)
 
-    try:
-        # Get the user's token from the Request for later authorization to access non-public entities.
-        # If an invalid token is presented, reject with an HTTP 401 Response.
-        # N.B. None is a "valid" user_token which may be adequate for access to public data.
-        user_token = entity_worker.get_request_auth_token(request=request)
-
-        # Get the user's token from the Request for later authorization to access non-public entities.
-        user_info = entity_worker.get_request_user_info_with_groups(request=request)
-
-        # Retrieve the expanded metadata for the entity.  If authorization of token or group membership
-        # does not allow access to the entity, exceptions will be raised describing the problem.
-        expanded_entity_metadata = entity_worker.get_expanded_dataset_metadata( dataset_id=id
-                                                                                , valid_user_token=user_token
-                                                                                , user_info=user_info)
-        return jsonify(expanded_entity_metadata)
-    except entityEx.EntityBadRequestException as e_400:
-        return jsonify({'error': e_400.message}), 400
-    except entityEx.EntityUnauthorizedException as e_401:
-        return jsonify({'error': e_401.message}), 401
-    except entityEx.EntityForbiddenException as e_403:
-        return jsonify({'error': e_403.message}), 403
-    except entityEx.EntityNotFoundException as e_404:
-        return jsonify({'error': e_404.message}), 404
-    except entityEx.EntityServerErrorException as e_500:
-        logger.exception(f"An unexpected error occurred during provenance metadata retrieval.")
-        return jsonify({'error': e_500.message}), 500
-    except Exception as e:
-        default_msg = 'An unexpected error occurred retrieving provenance metadata'
-        logger.exception(default_msg)
-        return jsonify({'error': default_msg}), 500
+    # Use the internal token to query the target entity
+    # since public entities don't require user token
+    token = get_internal_token()
+
+    # The argument id that shadows Python's built-in id should be an identifier for a Dataset.
+    # Get the entity dict from cache if exists
+    # Otherwise query against uuid-api and neo4j to get the entity dict if the id exists
+    dataset_dict = query_target_entity(id, token)
+    normalized_entity_type = dataset_dict['entity_type']
+
+    # Validate that the entity type is a Dataset (or a subtype of Dataset) before going further
+    if not schema_manager.entity_type_instanceof(normalized_entity_type, 'Dataset'):
+        bad_request_error(f"Unable to get the provenance metatdata for this: {normalized_entity_type},"
+                          " supported entity types: Dataset, Publication")
+
+    # Get the generated complete entity result from cache if exists
+    # Otherwise re-generate on the fly
+    complete_dict = schema_manager.get_complete_entity_result(token=token
+                                                              , entity_dict=dataset_dict)
+
+    # Determine if the entity is publicly visible based on its data only.
+    # To verify if a Collection is public, it is necessary to have its Datasets, which
+    # are populated as triggered data.  So pull back the complete entity for
+    # _get_entity_visibility() to check.
+    entity_scope = _get_entity_visibility(  normalized_entity_type=normalized_entity_type
+                                            ,entity_dict=complete_dict)
+    public_entity = (entity_scope is DataVisibilityEnum.PUBLIC)
+
+    # Set a variable reflecting the user's authorization by being in the HuBMAP-READ Globus Group
+    user_authorized = user_in_hubmap_read_group(request=request)
+
+    # Get user token from Authorization header
+    user_token = get_user_token(request)
+
+    # For non-public documents, reject the request if the user is not authorized
+    if not public_entity:
+        if user_token is None:
+            forbidden_error(    f"{normalized_entity_type} for {complete_dict['uuid']} is not"
+                                f" accessible without presenting a token.")
+        if not user_authorized:
+            forbidden_error(    f"The requested {normalized_entity_type} has non-public data."
+                                f"  A Globus token with access permission is required.")
+
+    # We'll need to return all the properties, including those generated by
+    # `on_read_trigger` (e.g., 'next_revision_uuid' and
+    # 'previous_revision_uuid'), to have a complete result.
+    # Collections, however, will filter out only public properties for return.
+
+    # Also normalize the result based on schema
+    final_result = schema_manager.normalize_entity_result_for_response(complete_dict)
+
+    # Identify fields to exclude from non-authorized responses for the entity type.
+    fields_to_exclude = schema_manager.get_fields_to_exclude(normalized_entity_type)
+
+    # For a public entity requested by a non-authorized user, strip the excluded fields from the response
+    if public_entity and not user_authorized:
+        final_result = schema_manager.exclude_properties_from_response(fields_to_exclude, final_result)
+
+    # Retrieve the associated data for the entity, and add it to the expanded dictionary.
+    associated_organ_list_resp = get_associated_organs_from_dataset(id=complete_dict['uuid'])
+    final_result['organs'] = associated_organ_list_resp.json
+
+    associated_sample_list_resp = get_associated_samples_from_dataset(id=complete_dict['uuid'])
+    final_result['samples'] = associated_sample_list_resp.json
+
+    associated_donor_list_resp = get_associated_donors_from_dataset(id=complete_dict['uuid'])
+
+    final_result['donors'] = associated_donor_list_resp.json
+
+    # Return JSON for the dictionary containing the entity metadata as well as metadata for the associated data.
+    return jsonify(final_result)
 
 """
 Retrieve the metadata information of a given entity by id
diff --git a/src/dev_entity_exceptions.py b/src/dev_entity_exceptions.py
deleted file mode 100644
index 724eabe1..00000000
--- a/src/dev_entity_exceptions.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# Exceptions used internally by the service, typically for anticipated exceptions.
-# Knowledge of Flask, HTTP codes, and formatting of the Response should be
-# closer to the endpoint @app.route() methods rather than throughout service.
-class EntityConfigurationException(Exception):
-    """Exception raised when problems loading the service configuration are encountered."""
-    def __init__(self, message='There were problems loading the configuration for the service.'):
-        self.message = message
-        super().__init__(self.message)
-
-class EntityRequestAuthorizationException(Exception):
-    """Exception raised for authorization info on a Request."""
-    def __init__(self, message='Request authorization problem.'):
-        self.message = message
-        super().__init__(self.message)
-
-class EntityUnauthorizedException(Exception):
-    """Exception raised when authorization for a resource fails."""
-    def __init__(self, message='Authorization failed.'):
-        self.message = message
-        super().__init__(self.message)
-
-class EntityForbiddenException(Exception):
-    """Exception raised when authorization for a resource is forbidden."""
-    def __init__(self, message='Access forbidden.'):
-        self.message = message
-        super().__init__(self.message)
-
-class EntityNotFoundException(Exception):
-    """Exception raised when entity retrieval returns no results."""
-    def __init__(self, message='Not found.'):
-        self.message = message
-        super().__init__(self.message)
-
-class EntityBadRequestException(Exception):
-    """Exception raised when entity retrieval is flagged as a bad request."""
-    def __init__(self, message='Bad request.'):
-        self.message = message
-        super().__init__(self.message)
-
-class EntityServerErrorException(Exception):
-    """Exception raised when entity retrieval causes an internal server error."""
-    def __init__(self, message='Internal server error.'):
-        self.message = message
-        super().__init__(self.message)
\ No newline at end of file
diff --git a/src/dev_entity_worker.py b/src/dev_entity_worker.py
deleted file mode 100644
index 47a740a3..00000000
--- a/src/dev_entity_worker.py
+++ /dev/null
@@ -1,615 +0,0 @@
-import logging
-from datetime import datetime
-from typing import Annotated
-from requests.exceptions import RequestException
-
-from flask import Response
-
-# Local modules
-import app_neo4j_queries
-from schema import schema_neo4j_queries
-from schema.schema_constants import SchemaConstants
-from schema.schema_constants import DataVisibilityEnum
-
-# HuBMAP commons
-from hubmap_commons.hm_auth import AuthHelper
-from hubmap_commons.S3_worker import S3Worker
-
-import dev_entity_exceptions as entityEx
-
-COMMA_SEPARATOR = ','
-
-class EntityWorker:
-    authHelper = None
-    schemaMgr = None
-    memcachedClient = None
-    neo4jDriver = None
-    MEMCACHED_MODE = False
-    MEMCACHED_PREFIX = 'NONE'
-
-    def __init__(self, app_config, memcached_client_instance, schema_mgr, neo4j_driver_instance):
-        self.logger = logging.getLogger('entity.service')
-
-        if app_config is None:
-            raise entityEx.EntityConfigurationException('Configuration data loaded by the app must be passed to the worker.')
-        try:
-            ####################################################################################################
-            ## Load configuration variables used by this class
-            ####################################################################################################
-            clientId = app_config['APP_CLIENT_ID']
-            clientSecret = app_config['APP_CLIENT_SECRET']
-
-            # Whether Memcached is being used or not
-            # Default to false if the property is missing in the configuration file
-
-            self.MEMCACHED_MODE = app_config['MEMCACHED_MODE'] if 'MEMCACHED_MODE' in app_config else False
-            # Use prefix to distinguish the cached data of same source across different deployments
-            self.MEMCACHED_PREFIX = app_config['MEMCACHED_PREFIX'] if 'MEMCACHED_PREFIX' in app_config else 'NONE'
-
-            self.logger.debug(f"KBKBKB During init from config, MEMCACHED_MODE={self.MEMCACHED_MODE}")
-            self.logger.debug(f"KBKBKB During init from config, MEMCACHED_PREFIX={self.MEMCACHED_PREFIX}")
-
-            ####################################################################################################
-            ## S3Worker initialization
-            ####################################################################################################
-            if 'LARGE_RESPONSE_THRESHOLD' not in app_config \
-                or not isinstance(app_config['LARGE_RESPONSE_THRESHOLD'], int) \
-                or app_config['LARGE_RESPONSE_THRESHOLD'] > 10*(2**20)-1:
-                self.logger.error(f"There is a problem with the LARGE_RESPONSE_THRESHOLD setting in app.cfg."
-                                  f" Defaulting to small value so noticed quickly.")
-                large_response_threshold = 5000000
-            else:
-                large_response_threshold = int(app_config['LARGE_RESPONSE_THRESHOLD'])
-
-            self.logger.info(f"large_response_threshold set to {large_response_threshold}.")
-            self.S3_settings_dict = {   'large_response_threshold': large_response_threshold
-                                        ,'aws_access_key_id': app_config['AWS_ACCESS_KEY_ID']
-                                        ,'aws_secret_access_key': app_config['AWS_SECRET_ACCESS_KEY']
-                                        ,'aws_s3_bucket_name': app_config['AWS_S3_BUCKET_NAME']
-                                        ,'aws_object_url_expiration_in_secs': app_config['AWS_OBJECT_URL_EXPIRATION_IN_SECS']
-                                        ,'service_configured_obj_prefix': app_config['AWS_S3_OBJECT_PREFIX']}
-            try:
-                self.theS3Worker = S3Worker(ACCESS_KEY_ID=self.S3_settings_dict['aws_access_key_id']
-                                            , SECRET_ACCESS_KEY=self.S3_settings_dict['aws_secret_access_key']
-                                            , S3_BUCKET_NAME=self.S3_settings_dict['aws_s3_bucket_name']
-                                            , S3_OBJECT_URL_EXPIRATION_IN_SECS=self.S3_settings_dict['aws_object_url_expiration_in_secs']
-                                            , LARGE_RESPONSE_THRESHOLD=self.S3_settings_dict['large_response_threshold']
-                                            , SERVICE_S3_OBJ_PREFIX=self.S3_settings_dict['service_configured_obj_prefix'])
-                self.logger.info("self.theS3Worker initialized")
-            except Exception as e:
-                self.logger.error(f"Error initializing self.theS3Worker - '{str(e)}'.", exc_info=True)
-                raise entityEx.EntityConfigurationException(f"Unexpected error: {str(e)}")
-
-        except KeyError as ke:
-            self.logger.error(f"Expected configuration failed to load %s from app_config={app_config}"
-                              f" due to ke={str(ke)}")
-            raise entityEx.EntityConfigurationException('Expected configuration failed to load. See the logs.')
-
-        if schema_mgr is None:
-            raise entityEx.EntityConfigurationException('A schema manager must be passed to the worker until it instantiates its own.')
-        else:
-            self.schemaMgr = schema_mgr
-
-        if neo4j_driver_instance is None:
-            raise entityEx.EntityConfigurationException('A Neo4j driver must be passed to the worker until it instantiates its own.')
-        else:
-            self.neo4jDriver = neo4j_driver_instance
-
-        if memcached_client_instance is None:
-            self.logger.info('No cache client passed to the worker, running without memcache.')
-        self.memcachedClient = memcached_client_instance
-
-        ####################################################################################################
-        ## AuthHelper initialization
-        ####################################################################################################
-        if not clientId  or not clientSecret:
-            raise entityEx.EntityConfigurationException('Globus client id and secret are required in AuthHelper')
-        # Initialize AuthHelper class and ensure singleton
-        try:
-            if not AuthHelper.isInitialized():
-                self.authHelper = AuthHelper.create(    clientId,
-                                                        clientSecret)
-                self.logger.info('Initialized AuthHelper class successfully')
-            else:
-                self.authHelper = AuthHelper.instance()
-        except Exception as e:
-            msg = 'Failed to initialize the AuthHelper class'
-            # Log the full stack trace, prepend a line with our message
-            self.logger.exception(msg)
-            raise entityEx.EntityConfigurationException(msg)
-
-    def _user_in_hubmap_read_group(self, user_info):
-        if user_info is None:
-            return False
-        try:
-            # The property 'hmgroupids' is ALWAYS in the output with using schema_manager.get_user_info()
-            # when the token in request is a groups token
-            hubmap_read_group_uuid = self.authHelper.groupNameToId('HuBMAP-READ')['uuid']
-        except Exception as e:
-            # Log the full stack trace, prepend a line with our message
-            self.logger.exception(e)
-
-            # If the token is not a groups token, no group information available
-            # The commons.hm_auth.AuthCache would return a Response with 500 error message
-            # We treat such cases as the user not in the HuBMAP-READ group
-            # KBKBKB @TODO clarify these comments...there is no token in this scope...
-            # KBKBKB @TODO https://github.com/hubmapconsortium/commons/blob/6d2f7b323191b272d97b79bc41b1a04295444006/hubmap_commons/hm_auth.py#L482-L488
-            # KBKBKB @TODO seems to indicate we will get None rather than any kind of Response...
-            return False
-
-        return ('hmgroupids' in user_info and hubmap_read_group_uuid in user_info['hmgroupids'])
-
-    '''
-    Return the "visibility" of an entity as DataVisibilityEnum value.  Determination of
-    "public" or "non-public" is specific to entity type.
-    
-    Parameters
-    ----------
-    entity_dict : dict
-        A Python dictionary retrieved for the entity 
-    
-    Returns
-    -------
-    DataVisibilityEnum
-        A value identifying if the entity is public or non-public
-    '''
-    def _get_entity_visibility(self, entity_dict):
-        normalized_entity_type = entity_dict['entity_type']
-        if normalized_entity_type not in self.schemaMgr.get_all_entity_types():
-            self.logger.log(logging.ERROR
-                            ,f"normalized_entity_type={normalized_entity_type}"
-                              f" not recognized by schema_manager.get_all_entity_types().")
-            raise entityEx.EntityBadRequestException(f"'{normalized_entity_type}' is not a recognized entity type.")
-
-        # Use the characteristics of the entity's data to classify the entity's visibility, so
-        # it can be used along with the user's authorization to determine access.
-        entity_visibility = DataVisibilityEnum.NONPUBLIC
-        if self.schemaMgr.entity_type_instanceof(normalized_entity_type, 'Dataset') and \
-                entity_dict['status'].lower() == SchemaConstants.DATASET_STATUS_PUBLISHED:
-            entity_visibility = DataVisibilityEnum.PUBLIC
-        elif self.schemaMgr.entity_type_instanceof(normalized_entity_type, 'Collection') and \
-                'registered_doi' in entity_dict and \
-                'doi_url' in entity_dict and \
-                'contacts' in entity_dict and \
-                'contributors' in entity_dict and \
-                len(entity_dict['contacts']) > 0 and \
-                len(entity_dict['contributors']) > 0:
-            # Get the data_access_level for each Dataset in the Collection from Neo4j
-            collection_dataset_statuses = schema_neo4j_queries.get_collection_datasets_statuses(self.neo4jDriver
-                                                                                                , entity_dict['uuid'])
-
-            # If the list of distinct statuses for Datasets in the Collection only has one entry, and
-            # it is 'published', the Collection is public
-            if len(collection_dataset_statuses) == 1 and \
-                    collection_dataset_statuses[0].lower() == SchemaConstants.DATASET_STATUS_PUBLISHED:
-                entity_visibility = DataVisibilityEnum.PUBLIC
-        elif normalized_entity_type == 'Upload':
-            # Upload entities require authorization to access, so keep the
-            # entity_visibility as non-public, as initialized outside block.
-            pass
-        elif normalized_entity_type in ['Donor', 'Sample'] and \
-                entity_dict['data_access_level'] == SchemaConstants.ACCESS_LEVEL_PUBLIC:
-            entity_visibility = DataVisibilityEnum.PUBLIC
-        return entity_visibility
-
-    '''
-    Get target entity dict from the cache or from Neo4j query for the given id. When caching enabled,
-    data retrieved from Neo4j will be cached.
-
-    Parameters
-    ----------
-    entity_id : str
-        The uuid or hubmap_id of target entity
-
-    Returns
-    -------
-    dict
-        A dictionary of entity details either from the cache or a neo4j query.
-    '''
-    def _query_target_entity(self, entity_id):
-        # Get the entity dict from cache if exists
-        # Otherwise query against uuid-api and neo4j to get the entity dict if the entity_id exists
-
-        cache_result = None
-
-        try:
-            # Get cached ids if exist otherwise retrieve from UUID-API
-            hubmap_ids = self.schemaMgr.get_hubmap_ids(entity_id.strip())
-
-            # Get the target uuid if all good
-            uuid = hubmap_ids['hm_uuid']
-
-            # Look up the cache again by the uuid since we only use uuid in the cache key
-            if self.MEMCACHED_MODE and self.MEMCACHED_PREFIX and self.memcachedClient:
-                cache_key = f'{self.MEMCACHED_PREFIX}_neo4j_{uuid}'
-                cache_result = self.memcachedClient.get(cache_key)
-
-            if cache_result is None:
-                self.logger.info(f'Neo4j entity cache of {uuid} not found or expired at time {datetime.now()}')
-
-                # Make a new query against neo4j
-                entity_dict = schema_neo4j_queries.get_entity(neo4j_driver=self.neo4jDriver
-                                                              ,uuid=uuid)
-
-                # The uuid exists via uuid-api doesn't mean it also exists in Neo4j
-                if not entity_dict:
-                    msg = f"Entity of entity_id: {entity_id} not found in Neo4j"
-                    self.logger.debug(msg)
-                    raise entityEx.EntityNotFoundException(msg)
-
-                # Save to cache
-                if self.MEMCACHED_MODE and self.MEMCACHED_PREFIX and self.memcachedClient:
-                    self.logger.info(f'Creating neo4j entity result cache of {uuid} at time {datetime.now()}')
-                    cache_key = f'{self.MEMCACHED_PREFIX}_neo4j_{uuid}'
-                    self.memcachedClient.set(cache_key, entity_dict, expire=SchemaConstants.MEMCACHED_TTL)
-            else:
-                self.logger.info(f"Using neo4j entity cache of UUID {uuid} at time {datetime.now()}")
-                self.logger.debug(cache_result)
-                entity_dict = cache_result
-        except RequestException as e:
-            # Due to the use of response.raise_for_status() in schema_manager.get_hubmap_ids()
-            # we can access the status codes from the exception
-            status_code = e.response.status_code
-
-            if status_code == 400:
-                raise entityEx.EntityBadRequestException(e.response.text)
-            if status_code == 404:
-                raise entityEx.EntityNotFoundException(e.response.text)
-            else:
-                raise entityEx.EntityUnauthorizedException(e.response.text)
-
-        # One final return
-        return entity_dict
-
-    ''''
-    Retrieve the metadata information for an entity by id.
-    
-    Get target entity dict based upon the user's authorization. The full dictionary may be
-    filtered down if credentials were not presented for full access.
-    
-    Parameters
-    ----------
-    entity_id : str
-        The HuBMAP ID (e.g. HBM123.ABCD.456) or UUID of target entity .
-    valid_user_token : str
-        Either the valid current token for an authenticated user or None.
-    user_info : dict
-        Information for the logged-in user to be used for authorization accessing non-public entities.
-        
-    Returns
-    -------
-    dict
-        A dictionary containing all the properties the target entity.
-    '''
-    def _get_expanded_entity_for_auth_level(self, entity_dict:dict, valid_user_token:Annotated[str, 32]
-                                            , user_info:dict) -> dict:
-
-        # Use the internal token to query the target entity to assure it is returned. This way public
-        # entities can be accessed even if valid_user_token is None.
-        internal_token = self.authHelper.getProcessSecret()
-        normalized_entity_type = entity_dict['entity_type']
-
-        # Get the generated complete entity result from cache if exists
-        # Otherwise re-generate on the fly
-        complete_dict = self.schemaMgr.get_complete_entity_result(token=internal_token
-                                                                  , entity_dict=entity_dict)
-
-        # Determine if the entity is publicly visible base on its data, only.
-        # To verify if a Collection is public, it is necessary to have its Datasets, which
-        # are populated as triggered data.  So pull back the complete entity for
-        # _get_entity_visibility() to check.
-        entity_scope = self._get_entity_visibility(entity_dict=complete_dict)
-        public_entity = (entity_scope is DataVisibilityEnum.PUBLIC)
-
-        # Set a variable reflecting the user's authorization by being in the HuBMAP-READ Globus Group
-        user_authorized = self._user_in_hubmap_read_group(user_info=user_info)
-
-        # For non-public documents, reject the request if the user is not authorized
-        if not public_entity:
-            if valid_user_token is None:
-                raise entityEx.EntityForbiddenException(f"{normalized_entity_type} for {entity_dict['uuid']} is not"
-                                                        f" accessible without presenting a token.")
-            if not user_authorized:
-                raise entityEx.EntityForbiddenException(f"The requested {normalized_entity_type} has non-public data."
-                                                        f"  A Globus token with access permission is required.")
-
-        # We'll need to return all the properties including those generated by
-        # `on_read_trigger` to have a complete result e.g., the 'next_revision_uuid' and
-        # 'previous_revision_uuid' being used below.
-        # Collections, however, will filter out only public properties for return.
-
-        # Also normalize the result based on schema
-        final_result = self.schemaMgr.normalize_entity_result_for_response(complete_dict)
-
-        # Identify fields to exclude from non-authorized responses for the entity type.
-        fields_to_exclude = self.schemaMgr.get_fields_to_exclude(normalized_entity_type)
-
-        # Response with the dict
-        #if public_entity and not user_in_hubmap_read_group(request):
-        if public_entity and not user_authorized:
-            final_result = self.schemaMgr.exclude_properties_from_response(fields_to_exclude, final_result)
-        return final_result
-
-    '''
-    Retrieve the metadata information for certain data associated with entity.  This method supports
-    Dataset entities, and can get the associated data for organs, samples, or donors.
-    
-    Get associated data dict based upon the user's authorization. The associated data may be
-    filtered down if credentials were not presented for full access.
-    
-    Parameters
-    ----------
-    dataset_dict : dict
-        A dictionary containing all the properties the target entity.
-    dataset_visibility : DataVisibilityEnum
-        An indication of the entity itself is public or not, so the associated data can
-        be filtered to match the entity dictionary before being returned.
-    valid_user_token : str
-        Either the valid current token for an authenticated user or None.
-    user_info : dict
-        Information for the logged-in user to be used for authorization accessing non-public entities.
-    associated_data : str
-        A string indicating the associated property to be retrieved, which must be from
-        the values supported by this method.
-        
-    Returns
-    -------
-    list
-        A dictionary containing all the properties the target entity.
-    '''
-    def _get_dataset_associated_data(   self, dataset_dict:dict, dataset_visibility:DataVisibilityEnum
-                                        , valid_user_token:Annotated[str, 32], user_info:dict
-                                        , associated_data:str) -> list:
-
-        # Confirm the associated data requested is supported by this method.
-        retrievable_associations = ['organs', 'samples', 'donors']
-        if associated_data.lower() not in retrievable_associations:
-            raise entityEx.EntityBadRequestException(   f"Dataset associated data cannot be retrieved for"
-                                                        f" {associated_data}, only"
-                                                        f" {COMMA_SEPARATOR.join(retrievable_associations)}.")
-
-        # Confirm the dictionary passed in is for a Dataset entity.
-        if not self.schemaMgr.entity_type_instanceof(dataset_dict['entity_type'], 'Dataset'):
-            raise entityEx.EntityBadRequestException(   f"'{dataset_dict['entity_type']}' for"
-                                                        f" uuid={dataset_dict['uuid']} is not a Dataset or Publication,"
-                                                        f" so '{associated_data}' can not be retrieved for it.")
-        # Set up fields to be excluded when retrieving the entities associated with
-        # the Dataset.  Organs are one kind of Sample.
-        if associated_data.lower() in ['organs', 'samples']:
-            fields_to_exclude = self.schemaMgr.get_fields_to_exclude('Sample')
-        elif associated_data.lower() in ['donors']:
-            fields_to_exclude = self.schemaMgr.get_fields_to_exclude('Donor')
-        else:
-            self.logger.error(  f"Expected associated data type to be verified, but got"
-                                f" associated_data.lower()={associated_data.lower()}.")
-            raise entityEx.EntityServerErrorException(f"Unexpected error retrieving '{associated_data}' for a Dataset")
-
-        public_entity = (dataset_visibility is DataVisibilityEnum.PUBLIC)
-
-        # Set a variable reflecting the user's authorization by being in the HuBMAP-READ Globus Group
-        user_authorized = self._user_in_hubmap_read_group(user_info=user_info)
-
-        # For non-public documents, reject the request if the user is not authorized
-        if not public_entity:
-            if valid_user_token is None:
-                raise entityEx.EntityForbiddenException(f"{dataset_dict['entity_type']} for"
-                                                        f" {dataset_dict['uuid']} is not"
-                                                        f" accessible without presenting a token.")
-            if not user_authorized:
-                raise entityEx.EntityForbiddenException(f"The requested Dataset has non-public data."
-                                                        f"  A Globus token with access permission is required.")
-
-        # By now, either the entity is public accessible or the user has the correct access level
-        if associated_data.lower() == 'organs':
-            associated_entities = app_neo4j_queries.get_associated_organs_from_dataset( self.neo4jDriver,
-                                                                                        dataset_dict['uuid'])
-        elif associated_data.lower() == 'samples':
-            associated_entities = app_neo4j_queries.get_associated_samples_from_dataset(self.neo4jDriver,
-                                                                                        dataset_dict['uuid'])
-        elif associated_data.lower() == 'donors':
-            associated_entities = app_neo4j_queries.get_associated_donors_from_dataset( self.neo4jDriver,
-                                                                                        dataset_dict['uuid'])
-        else:
-            self.logger.error(  f"Expected associated data type to be verified, but got"
-                                f" associated_data.lower()={associated_data.lower()} while retrieving from Neo4j.")
-            raise entityEx.EntityServerErrorException(f"Unexpected error retrieving '{associated_data}' from the data store")
-
-        # If there are zero items in the list of associated_entities, return an empty list rather than retrieving.
-        if len(associated_entities) < 1:
-            return []
-
-        # Use the internal token to query the target entity to assure it is returned. This way public
-        # entities can be accessed even if valid_user_token is None.
-        internal_token = self.authHelper.getProcessSecret()
-        complete_entities_list = self.schemaMgr.get_complete_entities_list(token=internal_token
-                                                                           , entities_list=associated_entities)
-        # Final result after normalization
-        final_result = self.schemaMgr.normalize_entities_list_for_response(entities_list=complete_entities_list)
-
-        # For public entities, limit the fields in the response unless the authorization presented in the
-        # Request allows the user to see all properties.
-        if public_entity and not user_authorized:
-            filtered_entities_list = []
-            for entity in final_result:
-                final_entity_dict = self.schemaMgr.exclude_properties_from_response( excluded_fields=fields_to_exclude
-                                                                                    , output_dict=entity)
-                filtered_entities_list.append(final_entity_dict)
-            final_result = filtered_entities_list
-
-        return final_result
-
-    '''
-    Retrieve authentication bearer token presented on the request.  If no token is in the request, return None as
-    a valid user token value indicating public access.
-
-    Parameters
-    ----------
-    request : flask.request
-        The Flask http request object
-
-    Returns
-    -------
-    str
-        The bearer token presented on the request for authentication.
-    '''
-    def get_request_auth_token(self, request) -> str:
-        if 'Authorization' not in request.headers:
-            return None
-
-        # Get user token from Authorization header
-        # getAuthorizationTokens() also handles MAuthorization header but we are not using that here
-        try:
-            # HTTP header names are case-insensitive
-            # request.headers.get('Authorization') returns None if the header doesn't exist
-            request_token = self.authHelper.getAuthorizationTokens(request.headers)
-        except Exception as e:
-            msg = "Failed to parse the Authorization token by calling commons.auth_helper.getAuthorizationTokens()"
-            # Log the full stack trace, prepend a line with our message
-            self.logger.exception(msg)
-            raise entityEx.EntityRequestAuthorizationException(msg)
-
-        # When the token is a flask.Response instance, it MUST be a 401 error with message.
-        # That's how commons.auth_helper.getAuthorizationTokens() was designed.
-        if isinstance(request_token, Response):
-            # The Response.data returns binary string, need to decode
-            raise entityEx.EntityUnauthorizedException(request_token.get_data().decode())
-
-        # Make sure the token is not invalid or expired by calling a method with
-        # the side-effect of returning a Response
-        user_info = self.authHelper.getUserInfo(request_token, False)
-        if isinstance(user_info, Response):
-            # The Response.data returns binary string, need to decode
-            raise entityEx.EntityUnauthorizedException(user_info.get_data().decode())
-
-        return request_token
-
-    '''
-    Retrieve authorization information for the bearer token presented on the request, including
-    Globus Group information.
-
-    Parameters
-    ----------
-    request : flask.request
-        The Flask http request object
-
-    Returns
-    -------
-    dict
-        The user information, including Globus Group data, which can be used for authorization.
-    '''
-    def get_request_user_info_with_groups(self, request):
-        try:
-            # The property 'hmgroupids' is ALWAYS in the output with using schema_manager.get_user_info()
-            # when the token in request is a groups token
-            user_info = self.authHelper.getUserInfoUsingRequest(httpReq=request
-                                                                , getGroups=True)
-            self.logger.info("======user_info======")
-            self.logger.info(user_info)
-            if isinstance(user_info, Response):
-                # Bubble up the actual error message from commons
-                # The Response.data returns binary string, need to decode
-                msg = user_info.get_data().decode()
-                # Log the full stack trace, prepend a line with our message
-                self.logger.exception(msg)
-                return None
-        except Exception as e:
-            # Log the full stack trace, prepend a line with our message
-            self.logger.exception(e)
-            return None
-        return user_info
-
-    ''''
-    Retrieve expanded metadata information for an entity by id, including metadata for
-    associated data.
-
-    Get target entity dict based upon the user's authorization. The full dictionary may be
-    filtered down if credentials were not presented for full access.
-
-    Parameters
-    ----------
-    entity_id : str
-        The HuBMAP ID (e.g. HBM123.ABCD.456) or UUID of target entity .
-    valid_user_token : str
-        Either the valid current token for an authenticated user or None.
-    user_info : dict
-        Information for the logged-in user to be used for authorization accessing non-public entities.
-
-    Returns
-    -------
-    dict
-        A dictionary containing all the properties the target entity.
-    '''
-    def get_expanded_dataset_metadata(self, dataset_id:Annotated[str, 32], valid_user_token:Annotated[str, 32]
-                                     , user_info:dict) -> dict:
-        # Retrieve the metadata dictionary for the Dataset, which will be expanded later to hold entries for the
-        # associated data.
-        dataset_dict = self._query_target_entity(entity_id=dataset_id)
-
-        # Confirm the dataset_id passed in is for a Dataset entity.
-        if not self.schemaMgr.entity_type_instanceof(   entity_type=dataset_dict['entity_type']
-                                                        , entity_class='Dataset'):
-            raise entityEx.EntityBadRequestException(f"Only Dataset provenance metadata can be retrieved."
-                                                     f" An entity of type '{dataset_dict['entity_type']}'"
-                                                     f" for uuid={dataset_dict['uuid']} is not supported.")
-
-        expanded_dataset_dict = self._get_expanded_entity_for_auth_level(   entity_dict=dataset_dict
-                                                                            , valid_user_token=valid_user_token
-                                                                            , user_info=user_info)
-
-        # Determine if the entity is publicly visible base on its data, only.
-        # To verify if a Collection is public, it is necessary to have its Datasets, which
-        # are populated as triggered data.  So pull back the complete entity for
-        # _get_entity_visibility() to check.
-        dataset_scope = self._get_entity_visibility(entity_dict=expanded_dataset_dict)
-
-        # Retrieve the associated data for the entity, and add it to the expanded dictionary.
-        associated_organ_list = self._get_dataset_associated_data(  dataset_dict=expanded_dataset_dict
-                                                                    , dataset_visibility=dataset_scope
-                                                                    , valid_user_token=valid_user_token
-                                                                    , user_info=user_info
-                                                                    , associated_data='Organs')
-        expanded_dataset_dict['organs'] = associated_organ_list
-
-        associated_sample_list = self._get_dataset_associated_data( dataset_dict=expanded_dataset_dict
-                                                                    , dataset_visibility=dataset_scope
-                                                                    , valid_user_token=valid_user_token
-                                                                    , user_info=user_info
-                                                                    , associated_data='Samples')
-        expanded_dataset_dict['samples'] = associated_sample_list
-
-        associated_donor_list = self._get_dataset_associated_data(  dataset_dict=expanded_dataset_dict
-                                                                    , dataset_visibility=dataset_scope
-                                                                    , valid_user_token=valid_user_token
-                                                                    , user_info=user_info
-                                                                    , associated_data='Donors')
-
-        expanded_dataset_dict['donors'] = associated_donor_list
-
-        # Return the dictionary containing the entity metadata as well as metadata for the associated data.
-        return expanded_dataset_dict
-
-    # KBKBKB @TODO for future use of /datasets/<id>/organs endpoint of app.py needs.
-    def get_organs_associated_with_dataset(self, dataset_id: Annotated[str, 32], valid_user_token: Annotated[str, 32]
-        , user_info: dict) -> list:
-
-        dataset_dict = self._get_entity_by_id_for_auth_level(entity_id=dataset_id
-                                                             , valid_user_token=valid_user_token
-                                                             , user_info=user_info)
-        # Determine if the entity is publicly visible base on its data, only.
-        # To verify if a Collection is public, it is necessary to have its Datasets, which
-        # are populated as triggered data.  So pull back the complete entity for
-        # _get_entity_visibility() to check.
-        entity_scope = self._get_entity_visibility(entity_dict=dataset_dict)
-
-        associated_organ_list = self._get_dataset_associated_data(  dataset_dict=dataset_dict
-                                                                    , dataset_visibility=entity_scope
-                                                                    , valid_user_token=valid_user_token
-                                                                    , user_info=user_info
-                                                                    , associated_data='Organs')
-
-        # If there are zero items in the list associated_organs, then there are no associated
-        # Organs and a 404 will be returned.
-        if len(associated_organ_list) < 1:
-            raise entityEx.EntityNotFoundException(f'Dataset {dataset_id} does not have any associated organs')
-
-        return associated_organ_list

From f1cea350dd5c8b860750f5a113305090d134fa62 Mon Sep 17 00:00:00 2001
From: kburke <209327+kburke@users.noreply.github.com>
Date: Mon, 27 Jan 2025 12:47:16 -0500
Subject: [PATCH 2/2] Introduce method for retrieving Dataset associated
 entities to app.py.

---
 src/app.py | 135 +++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 126 insertions(+), 9 deletions(-)

diff --git a/src/app.py b/src/app.py
index bc87fe63..fb5c970e 100644
--- a/src/app.py
+++ b/src/app.py
@@ -613,6 +613,111 @@ def _get_entity_visibility(normalized_entity_type, entity_dict):
         entity_visibility = DataVisibilityEnum.PUBLIC
     return entity_visibility
 
+'''
+Retrieve the metadata information for certain data associated with an entity.  This method supports
+Dataset entities, and can get the associated data for organs, samples, or donors.
+
+Get associated data dict based upon the user's authorization. The associated data may be
+filtered down if credentials were not presented for full access.
+
+Parameters
+----------
+dataset_dict : dict
+    A dictionary containing all the properties of the target entity.
+dataset_visibility : DataVisibilityEnum
+    An indication of whether the entity itself is public or not, so the associated data can
+    be filtered to match the entity dictionary before being returned.
+valid_user_token : str
+    Either the valid current token for an authenticated user or None.
+request : flask.request
+    The Flask http request object, used to check whether the user is in the HuBMAP-READ Globus Group.
+associated_data : str
+    A string indicating the associated property to be retrieved, which must be from
+    the values supported by this method.
+
+Returns
+-------
+list
+    A list of the entities associated with the Dataset, or an empty list if there are none.
+'''
+def _get_dataset_associated_data(dataset_dict, dataset_visibility, valid_user_token, request, associated_data: str):
+
+    # Confirm the associated data requested is supported by this method.
+    retrievable_associations = ['organs', 'samples', 'donors']
+    if associated_data.lower() not in retrievable_associations:
+        bad_request_error(  f"Dataset associated data cannot be retrieved for"
+                            f" {associated_data}, only"
+                            f" {COMMA_SEPARATOR.join(retrievable_associations)}.")
+
+    # Confirm the dictionary passed in is for a Dataset entity.
+    if not schema_manager.entity_type_instanceof(dataset_dict['entity_type'], 'Dataset'):
+        bad_request_error(  f"'{dataset_dict['entity_type']}' for"
+                            f" uuid={dataset_dict['uuid']} is not a Dataset or Publication,"
+                            f" so '{associated_data}' can not be retrieved for it.")
+    # Set up fields to be excluded when retrieving the entities associated with
+    # the Dataset.  Organs are one kind of Sample.
+    if associated_data.lower() in ['organs', 'samples']:
+        fields_to_exclude = schema_manager.get_fields_to_exclude('Sample')
+    elif associated_data.lower() in ['donors']:
+        fields_to_exclude = schema_manager.get_fields_to_exclude('Donor')
+    else:
+        logger.error(   f"Expected associated data type to be verified, but got"
+                        f" associated_data.lower()={associated_data.lower()}.")
+        internal_server_error(f"Unexpected error retrieving '{associated_data}' for a Dataset")
+
+    public_entity = (dataset_visibility is DataVisibilityEnum.PUBLIC)
+
+    # Set a variable reflecting the user's authorization by being in the HuBMAP-READ Globus Group
+    user_authorized = user_in_hubmap_read_group(request=request)
+
+    # For non-public documents, reject the request if the user is not authorized
+    if not public_entity:
+        if valid_user_token is None:
+            forbidden_error(    f"{dataset_dict['entity_type']} for"
+                                f" {dataset_dict['uuid']} is not"
+                                f" accessible without presenting a token.")
+        if not user_authorized:
+            forbidden_error(    f"The requested Dataset has non-public data."
+                                f"  A Globus token with access permission is required.")
+
+    # By now, either the entity is publicly accessible or the user has the correct access level
+    if associated_data.lower() == 'organs':
+        associated_entities = app_neo4j_queries.get_associated_organs_from_dataset(neo4j_driver_instance,
+                                                                                   dataset_dict['uuid'])
+    elif associated_data.lower() == 'samples':
+        associated_entities = app_neo4j_queries.get_associated_samples_from_dataset(neo4j_driver_instance,
+                                                                                    dataset_dict['uuid'])
+    elif associated_data.lower() == 'donors':
+        associated_entities = app_neo4j_queries.get_associated_donors_from_dataset(neo4j_driver_instance,
+                                                                                   dataset_dict['uuid'])
+    else:
+        logger.error(   f"Expected associated data type to be verified, but got"
+                        f" associated_data.lower()={associated_data.lower()} while retrieving from Neo4j.")
+        internal_server_error(f"Unexpected error retrieving '{associated_data}' from the data store")
+
+    # If there are zero items in the list of associated_entities, return an empty list rather than retrieving.
+    if len(associated_entities) < 1:
+        return []
+
+    # Use the internal token to query the target entity to assure it is returned. This way public
+    # entities can be accessed even if valid_user_token is None.
+    internal_token = auth_helper_instance.getProcessSecret()
+    complete_entities_list = schema_manager.get_complete_entities_list( token=internal_token
+                                                                        , entities_list=associated_entities)
+    # Final result after normalization
+    final_result = schema_manager.normalize_entities_list_for_response(entities_list=complete_entities_list)
+
+    # For public entities, limit the fields in the response unless the authorization presented in the
+    # Request allows the user to see all properties.
+    if public_entity and not user_authorized:
+        filtered_entities_list = []
+        for entity in final_result:
+            final_entity_dict = schema_manager.exclude_properties_from_response(excluded_fields=fields_to_exclude
+                                                                                , output_dict=entity)
+            filtered_entities_list.append(final_entity_dict)
+        final_result = filtered_entities_list
+
+    return final_result
 
 '''
 Retrieve the full provenance metadata information of a given entity by id, as
@@ -708,15 +813,27 @@ def get_provenance_metadata_by_id_for_auth_level(id):
         final_result = schema_manager.exclude_properties_from_response(fields_to_exclude, final_result)
 
     # Retrieve the associated data for the entity, and add it to the expanded dictionary.
-    associated_organ_list_resp = get_associated_organs_from_dataset(id=complete_dict['uuid'])
-    final_result['organs'] = associated_organ_list_resp.json
-
-    associated_sample_list_resp = get_associated_samples_from_dataset(id=complete_dict['uuid'])
-    final_result['samples'] = associated_sample_list_resp.json
-
-    associated_donor_list_resp = get_associated_donors_from_dataset(id=complete_dict['uuid'])
-
-    final_result['donors'] = associated_donor_list_resp.json
+    associated_organ_list = _get_dataset_associated_data(   dataset_dict=final_result
+                                                            , dataset_visibility=entity_scope
+                                                            , valid_user_token=user_token
+                                                            , request=request
+                                                            , associated_data='Organs')
+    final_result['organs'] = associated_organ_list
+
+    associated_sample_list = _get_dataset_associated_data(   dataset_dict=final_result
+                                                            , dataset_visibility=entity_scope
+                                                            , valid_user_token=user_token
+                                                            , request=request
+                                                            , associated_data='Samples')
+    final_result['samples'] = associated_sample_list
+
+    associated_donor_list = _get_dataset_associated_data(   dataset_dict=final_result
+                                                            , dataset_visibility=entity_scope
+                                                            , valid_user_token=user_token
+                                                            , request=request
+                                                            , associated_data='Donors')
+
+    final_result['donors'] = associated_donor_list
 
     # Return JSON for the dictionary containing the entity metadata as well as metadata for the associated data.
     return jsonify(final_result)