hubmapconsortium · yuanzhou · Sep 11, 2025 · Sep 10, 2025 · Sep 11, 2025
diff --git a/src/app.py b/src/app.py
@@ -1464,9 +1464,8 @@ def update_entity(id):
         if ('direct_ancestor_uuids' in json_data_dict) and (json_data_dict['direct_ancestor_uuids']):
             has_direct_ancestor_uuids = True
 
-            # `direct_ancestor_uuids` is required for updating a Dataset.
-            # Verify all of the direct ancestor UUIDs exist in the Neo4j graph.
-            # Form an error response if an Exception is raised.
+            # Verify all of the provided direct ancestor UUIDs exist
+            # Form an error response if an Exception is raised
             try:
                 app_neo4j_queries.uuids_all_exist(neo4j_driver=neo4j_driver_instance
                                                   , uuids=json_data_dict['direct_ancestor_uuids'])
@@ -1491,27 +1490,10 @@ def update_entity(id):
         if ('dataset_uuids_to_link' in json_data_dict) and (json_data_dict['dataset_uuids_to_link']):
             has_dataset_uuids_to_link = True
 
-            # Check existence of those datasets to be linked
-            # If one of the datasets to be linked appears to be already linked,
-            # neo4j query won't create the new linkage due to the use of `MERGE`
-            for dataset_uuid in json_data_dict['dataset_uuids_to_link']:
-                dataset_dict = query_target_entity(dataset_uuid, user_token)
-                # Also make sure it's a Dataset (or publication 2/17/23)
-                if dataset_dict['entity_type'] not in ['Dataset', 'Publication']:
-                    bad_request_error(f"The uuid: {dataset_uuid} is not a Dataset or Publication, cannot be linked to this Upload")
-
         has_dataset_uuids_to_unlink = False
         if ('dataset_uuids_to_unlink' in json_data_dict) and (json_data_dict['dataset_uuids_to_unlink']):
             has_dataset_uuids_to_unlink = True
 
-            # Check existence of those datasets to be unlinked
-            # If one of the datasets to be unlinked appears to be not linked at all,
-            # the neo4j cypher will simply skip it because it won't match the "MATCH" clause
-            # So no need to tell the end users that this dataset is not linked
-            # Let alone checking the entity type to ensure it's a Dataset
-            for dataset_uuid in json_data_dict['dataset_uuids_to_unlink']:
-                dataset_dict = query_target_entity(dataset_uuid, user_token)
-
         # Generate 'before_update_trigger' data and update the entity details in Neo4j
         merged_updated_dict = update_entity_details(request, normalized_entity_type, user_token, json_data_dict, entity_dict)
 

diff --git a/src/app_neo4j_queries.py b/src/app_neo4j_queries.py
@@ -686,7 +686,7 @@ def get_dataset_revision_number(neo4j_driver, uuid):
 ----------
 neo4j_driver : neo4j.Driver object
     The neo4j database connection pool
-uuid : str
+dataset_uuid : str
     The uuid of the target entity: Dataset
 """
 def get_associated_organs_from_dataset(neo4j_driver, dataset_uuid):
@@ -708,6 +708,17 @@ def get_associated_organs_from_dataset(neo4j_driver, dataset_uuid):
 
     return results
 
+
+"""
+Retrieve the list of uuids for samples associated with a given dataset
+
+Parameters
+----------
+neo4j_driver : neo4j.Driver object
+    The neo4j database connection pool
+dataset_uuid : str
+    The uuid of the target entity: Dataset
+"""
 def get_associated_samples_from_dataset(neo4j_driver, dataset_uuid):
     results = []
 
@@ -727,6 +738,17 @@ def get_associated_samples_from_dataset(neo4j_driver, dataset_uuid):
 
     return results
 
+
+"""
+Retrieve the list of uuids for donors associated with a given dataset
+
+Parameters
+----------
+neo4j_driver : neo4j.Driver object
+    The neo4j database connection pool
+dataset_uuid : str
+    The uuid of the target entity: Dataset
+"""
 def get_associated_donors_from_dataset(neo4j_driver, dataset_uuid):
     results = []
 
@@ -1072,14 +1094,29 @@ def get_tuplets(neo4j_driver, uuid, status, prop_key):
                 results = schema_neo4j_queries.nodes_to_dicts(record[record_field_name])
     return results
 
-# Verify all UUIDs in a list are found as Neo4j node identifiers.
-# Return True if all list entries are found, and raise an exception if one or more
-# entries are not found when expected to be in the Neo4j graph.
+
+"""
+Verify all UUIDs in a list are found as Neo4j node identifiers.
+Raise an exception if any of them is not found in the Neo4j graph.
+
+Parameters
+----------
+neo4j_driver : neo4j.Driver object
+    The neo4j database connection pool
+uuids : list
+    The uuids list
+
+Returns
+-------
+bool
+    True if all list entries are found, and raise an exception if one or more 
+    entries are not found when expected to be in the Neo4j graph.
+"""
 def uuids_all_exist(neo4j_driver, uuids:list):
     expected_match_count = len(uuids)
 
     record_field_name = 'match_count'
-    query = (f"MATCH(e: Entity) WHERE e.uuid IN {uuids} RETURN COUNT(e) AS {record_field_name}")
+    query = (f"MATCH (e:Entity) WHERE e.uuid IN {uuids} RETURN COUNT(e) AS {record_field_name}")
 
     with neo4j_driver.session() as session:
         record = session.read_transaction( schema_neo4j_queries.execute_readonly_tx
@@ -1096,28 +1133,32 @@ def uuids_all_exist(neo4j_driver, uuids:list):
                     f" exist as node identifiers in the Neo4j graph.")
 
 
+"""
+Get the entities from the neo4j database with the given uuids.
+
+Parameters
+----------
+uuids : Union[str, Iterable]
+    The uuid(s) of the entities to get.
+fields : Union[dict, Iterable, None], optional
+    The fields to return for each entity. If None, all fields are returned.
+    If a dict, the keys are the database fields to return and the values are the names to return them as.
+    If an iterable, the fields to return. Defaults to None.
+
+Returns
+-------
+Optional[List[neo4j.Record]]:
+    The entity records with the given uuids, or None if no entities were found.
+    The specified fields are returned for each entity.
+Raises
+------
+ValueError
+    If fields is not a dict, an iterable, or None.
+"""
 def get_entities_by_uuid(neo4j_driver,
                          uuids: Union[str, Iterable],
                          fields: Union[dict, Iterable, None] = None) -> Optional[list]:
-    """Get the entities from the neo4j database with the given uuids.
-    Parameters
-    ----------
-    uuids : Union[str, Iterable]
-        The uuid(s) of the entities to get.
-    fields : Union[dict, Iterable, None], optional
-        The fields to return for each entity. If None, all fields are returned.
-        If a dict, the keys are the database fields to return and the values are the names to return them as.
-        If an iterable, the fields to return. Defaults to None.
-    Returns
-    -------
-    Optional[List[neo4j.Record]]:
-        The entity records with the given uuids, or None if no datasets were found.
-        The specified fields are returned for each entity.
-    Raises
-    ------
-    ValueError
-        If fields is not a dict, an iterable, or None.
-    """
+
     if isinstance(uuids, str):
         uuids = [uuids]
     if not isinstance(uuids, list):

diff --git a/src/schema/provenance_schema.yaml b/src/schema/provenance_schema.yaml
@@ -1248,6 +1248,7 @@ ENTITIES:
         type: list 
         before_property_update_validators:
           - validate_no_duplicates_in_list
+          - validate_ids_exist_and_datasets
         generated: true # Disallow user input from request json when being created
         indexed: false
         transient: true
@@ -1260,6 +1261,7 @@ ENTITIES:
         type: list 
         before_property_update_validators:
           - validate_no_duplicates_in_list
+          - validate_ids_exist_and_datasets
         generated: true # Disallow user input from request json when being created
         indexed: false
         transient: true

diff --git a/src/schema/schema_neo4j_queries.py b/src/schema/schema_neo4j_queries.py
@@ -1408,6 +1408,39 @@ def get_upload_datasets(neo4j_driver, uuid, property_key = None):
     return results
 
 
+"""
+Get the uuids, out of a given list, that are found as Dataset nodes, for validation purposes
+
+Parameters
+----------
+neo4j_driver : neo4j.Driver object
+    The neo4j database connection pool
+uuids : list
+    The list of uuids from user input
+
+Returns
+-------
+list
+    The uuids from the given list that matched a Dataset node
+    (a uuid absent from this result was not found or is not a Dataset)
+"""
+def get_found_dataset_uuids(neo4j_driver, uuids):
+    # NOTE(review): uuids (user input) is interpolated into the Cypher text; prefer query parameters
+    query = (
+        f"MATCH (e:Dataset) "
+        f"WHERE e.uuid IN {uuids} "
+        f"RETURN COLLECT(e.uuid) AS {record_field_name}")
+
+    logger.info("======get_found_dataset_uuids() query======")
+    logger.debug(query)
+
+    with neo4j_driver.session() as session:
+        record = session.read_transaction(execute_readonly_tx, query)
+        uuids_list = record[record_field_name]
+
+        return uuids_list
+
+
 """
 Get count of published Dataset in the provenance hierarchy for a given Sample/Donor
 

diff --git a/src/schema/schema_validators.py b/src/schema/schema_validators.py
@@ -148,6 +148,33 @@ def validate_no_duplicates_in_list(property_key, normalized_entity_type, request
     if len(set(target_list)) != len(target_list):
         raise ValueError(f"The {property_key} field must only contain unique items")
 
+
+"""
+Validate all the provided uuids exist and all are Datasets when updating the target Upload
+
+Parameters
+----------
+property_key : str
+    The target property key
+normalized_entity_type : str
+    The normalized entity type of the target entity: Upload
+request: Flask request object
+    The instance of Flask request passed in from application request
+existing_data_dict : dict
+    A dictionary that contains all existing entity properties
+new_data_dict : dict
+    The json data in request body, already after the regular validations
+"""
+def validate_ids_exist_and_datasets(property_key, normalized_entity_type, request, existing_data_dict, new_data_dict):
+    neo4j_driver_instance = schema_manager.get_neo4j_driver_instance()
+    all_uuids_list = new_data_dict[property_key]
+    qualified_uuids_list = schema_neo4j_queries.get_found_dataset_uuids(neo4j_driver_instance, all_uuids_list)
+    unqualified_uuids_list = [item for item in all_uuids_list if item not in qualified_uuids_list]
+    # NOTE(review): only uuids returned as found Datasets qualify; confirm Publication uuids are still accepted
+    if unqualified_uuids_list:
+        raise ValueError(f"The following {len(unqualified_uuids_list)} uuids are either not found or not Dataset type: {str(unqualified_uuids_list)}.")
+
+
 """
 Validate that a given dataset is not a component of a multi-assay split parent dataset fore allowing status to be 
 updated. If a component dataset needs to be updated, update it via its parent multi-assay dataset