From 36b6eca48ce3199fa9dbf70d586137e4f10d6439 Mon Sep 17 00:00:00 2001
From: yuanzhou <zhy19@pitt.edu>
Date: Thu, 2 Oct 2025 17:32:23 -0400
Subject: [PATCH 1/4] Improved validation

---
 src/app.py                    |  8 ++++----
 src/schema/schema_manager.py  | 15 +++++++--------
 src/schema/schema_triggers.py |  2 +-
 3 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/src/app.py b/src/app.py
index 22013627..ead7b399 100644
--- a/src/app.py
+++ b/src/app.py
@@ -797,7 +797,7 @@ def get_entity_by_id(id):
         # First make sure the user provided query string params are valid
         for param in request.args:
             if param not in supported_qs_params:
-                bad_request_error(f"Only the following URL query string parameters (case-sensitive) are supported: {COMMA_SEPARATOR.join(supported_qs_params)}")
+                bad_request_error(f"Only the following URL query parameters (case-sensitive) are supported: {COMMA_SEPARATOR.join(supported_qs_params)}")
 
         # Return a single property key and value using ?property=<property_key>
         if 'property' in request.args:
@@ -811,11 +811,11 @@ def get_entity_by_id(id):
 
             # Validate the target property
             if single_property_key not in supported_property_keys:
-                bad_request_error(f"Only the following property keys are supported in the query string: {COMMA_SEPARATOR.join(supported_property_keys)}")
+                bad_request_error(f"Only the following property keys are supported in the query parameter: {COMMA_SEPARATOR.join(supported_property_keys)}")
 
             if single_property_key == 'status' and \
                     not schema_manager.entity_type_instanceof(normalized_entity_type, 'Dataset'):
-                bad_request_error(f"Only Dataset or Publication supports 'status' property key in the query string")
+                bad_request_error(f"Only Dataset or Publication supports 'status' property key in the query parameter")
 
             # Response with the property value directly
             # Don't use jsonify() on string value
@@ -893,7 +893,7 @@ def get_entity_by_id(id):
     # For such cases, we can't handle via simple Neo4j query. Instead, exclude at Python app level.
     # NOTE: need to convert the `neo4j_nested_properties_to_skip` to a format that can be used by 
     # `exclude_properties_from_response()`  - Zhou 10/1/2025
-    final_result = schema_manager.exclude_properties_from_response(schema_manager.group_dot_notation_fields(neo4j_nested_properties_to_skip), final_result)
+    final_result = schema_manager.exclude_properties_from_response(schema_manager.group_dot_notation_properties(neo4j_nested_properties_to_skip), final_result)
 
     # Response with the dict
     if public_entity and not user_in_hubmap_read_group(request):
diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py
index 724989cf..9cc4d044 100644
--- a/src/schema/schema_manager.py
+++ b/src/schema/schema_manager.py
@@ -374,23 +374,22 @@ def get_all_fields_to_exclude_from_query_string(request):
         properties_to_exclude_str = request.args.get('exclude')
         
         if properties_to_exclude_str:
-            # Must all lowercase values
-            has_upper = any(c.isupper() for c in properties_to_exclude_str)
-            
-            if has_upper:
-                raise ValueError("All the properties specified in 'exclude' query string in URL must be lowercase.")
+            # Must contain only lowercase letters or underscores
+            # This check is faster, very readable, and handles the case efficiently
+            if not (all(c.islower() or c == '_' for c in properties_to_exclude_str) and any(c.isalpha() for c in properties_to_exclude_str)):
+                raise ValueError("Properties specified in 'exclude' query parameter must be either only lowercase letters or lowercase letters with underscores.")
 
             all_properties_to_exclude = [item.strip() for item in properties_to_exclude_str.split(",")]
 
             logger.info(f"User specified properties to exclude in request URL: {all_properties_to_exclude}")
         else:
-            raise ValueError("The value of the 'exclude' query string parameter can not be empty and must be similar to the form of 'a, b, c, d.e, ...' (case-sensitive).")
+            raise ValueError("The value of the 'exclude' query parameter can not be empty and must be similar to the form of 'a, b, c, d.e, f_g ...' (case-sensitive).")
 
         # A bit more validation to limit to depth 2
         for item in all_properties_to_exclude:
             if '.' in item:
                 if len(item.split('.')) > 2:
-                    raise ValueError("Only single dot-separated keys are allowed in 'exclude' (e.g., a.b). Keys with multiple dots like a.b.c are not supported.")
+                    raise ValueError("When dot-notation properties are specified in 'exclude', only single dot-notation allowed (e.g., a.b). Properties with multiple dots like a.b.c are not supported.")
         
         # More validation - ensure prohibited properties are not accepted
         # This two properties are required internally by `normalize_entity_result_for_response()`
@@ -425,7 +424,7 @@ def get_all_fields_to_exclude_from_query_string(request):
 list
     A list mixing strings and grouped dicts, like ['x', {'a': ['b', 'c']}]
 """
-def group_dot_notation_fields(flat_list):
+def group_dot_notation_properties(flat_list):
     output_list = []
     grouped_dict = {}
 
diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py
index 87211de4..9e9de756 100644
--- a/src/schema/schema_triggers.py
+++ b/src/schema/schema_triggers.py
@@ -2733,7 +2733,7 @@ def _get_neo4j_properties_to_exclude(property_key, request):
     # Find the specific sub list, depth is limited to 2
     # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method
     # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025
-    grouped_fields = schema_manager.group_dot_notation_fields(all_properties_to_exclude)
+    grouped_fields = schema_manager.group_dot_notation_properties(all_properties_to_exclude)
 
     for item in grouped_fields:
         # Find the depth 2 properties (top-level to this triggered entity)

From e6ec05f8f0c32b78179e77f8f3b100263910c4d0 Mon Sep 17 00:00:00 2001
From: yuanzhou <zhy19@pitt.edu>
Date: Thu, 2 Oct 2025 18:01:37 -0400
Subject: [PATCH 2/4] A bit cleanup

---
 src/app.py                    |  8 ++++----
 src/schema/schema_manager.py  |  2 +-
 src/schema/schema_triggers.py | 24 ------------------------
 3 files changed, 5 insertions(+), 29 deletions(-)

diff --git a/src/app.py b/src/app.py
index ead7b399..f1e92f28 100644
--- a/src/app.py
+++ b/src/app.py
@@ -783,7 +783,7 @@ def get_entity_by_id(id):
     # 'exclude' is newly added to reduce the large paylod caused by certain fields (`direct_ancestors.files` for instance)
     # When both 'property' and 'exclude' are specified in the URL, 'property' dominates
     # since the final result is a single field value - Zhou 10/1/2025
-    supported_qs_params = ['property', 'exclude']
+    supported_query_params = ['property', 'exclude']
 
     # There are three types of properties that can be excluded from the GET response
     # - top-level properties generated by trigger methods
@@ -794,10 +794,10 @@ def get_entity_by_id(id):
     neo4j_nested_properties_to_skip = []
 
     if bool(request.args):
-        # First make sure the user provided query string params are valid
+        # First make sure the user provided query params are valid
         for param in request.args:
-            if param not in supported_qs_params:
-                bad_request_error(f"Only the following URL query parameters (case-sensitive) are supported: {COMMA_SEPARATOR.join(supported_qs_params)}")
+            if param not in supported_query_params:
+                bad_request_error(f"Only the following URL query parameters (case-sensitive) are supported: {COMMA_SEPARATOR.join(supported_query_params)}")
 
         # Return a single property key and value using ?property=<property_key>
         if 'property' in request.args:
diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py
index 9cc4d044..3325c775 100644
--- a/src/schema/schema_manager.py
+++ b/src/schema/schema_manager.py
@@ -383,7 +383,7 @@ def get_all_fields_to_exclude_from_query_string(request):
 
             logger.info(f"User specified properties to exclude in request URL: {all_properties_to_exclude}")
         else:
-            raise ValueError("The value of the 'exclude' query parameter can not be empty and must be similar to the form of 'a, b, c, d.e, f_g ...' (case-sensitive).")
+            raise ValueError("Properties specified in 'exclude' query parameter can not be empty and must be similar to the form of 'a, b, c, d.e, f_g ...' (case-sensitive).")
 
         # A bit more validation to limit to depth 2
         for item in all_properties_to_exclude:
diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py
index 9e9de756..188ac8ca 100644
--- a/src/schema/schema_triggers.py
+++ b/src/schema/schema_triggers.py
@@ -709,10 +709,6 @@ def get_collection_datasets(property_key, normalized_type, request, user_token,
 
     logger.info(f"Executing 'get_collection_datasets()' trigger method on uuid: {existing_data_dict['uuid']}")
     
-    # Get all the user specified fields either top-level or nested from the original query string in request URL
-    # Find the specific sub list, depth is limited to 2
-    # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method
-    # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025
     neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request)
 
     datasets_list = schema_neo4j_queries.get_collection_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], properties_to_exclude = neo4j_properties_to_exclude)
@@ -787,10 +783,6 @@ def get_dataset_collections(property_key, normalized_type, request, user_token,
 
     logger.info(f"Executing 'get_dataset_collections()' trigger method on uuid: {existing_data_dict['uuid']}")
 
-    # Get all the user specified fields either top-level or nested from the original query string in request URL
-    # Find the specific sub list, depth is limited to 2
-    # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method
-    # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025
     neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request)
 
     collections_list = schema_neo4j_queries.get_dataset_collections(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude)
@@ -871,10 +863,6 @@ def get_dataset_upload(property_key, normalized_type, request, user_token, exist
 
     logger.info(f"Executing 'get_dataset_upload()' trigger method on uuid: {existing_data_dict['uuid']}")
 
-    # Get all the user specified fields either top-level or nested from the original query string in request URL
-    # Find the specific sub list, depth is limited to 2
-    # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method
-    # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025
     neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request)
 
     upload_dict = schema_neo4j_queries.get_dataset_upload(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], properties_to_exclude = neo4j_properties_to_exclude)
@@ -1005,10 +993,6 @@ def get_dataset_direct_ancestors(property_key, normalized_type, request, user_to
 
     logger.info(f"Executing 'get_dataset_direct_ancestors()' trigger method on uuid: {existing_data_dict['uuid']}")
 
-    # Get all the user specified fields either top-level or nested from the original query string in request URL
-    # Find the specific sub list, depth is limited to 2
-    # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method
-    # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025
     neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request)
 
     direct_ancestors_list = schema_neo4j_queries.get_dataset_direct_ancestors(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude)
@@ -2027,10 +2011,6 @@ def get_sample_direct_ancestor(property_key, normalized_type, request, user_toke
     
     logger.info(f"Executing 'get_sample_direct_ancestor()' trigger method on uuid: {existing_data_dict['uuid']}")
 
-    # Get all the user specified fields either top-level or nested from the original query string in request URL
-    # Find the specific sub list, depth is limited to 2
-    # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method
-    # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025
     neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request)
 
     direct_ancestor_dict = schema_neo4j_queries.get_sample_direct_ancestor(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude)
@@ -2300,10 +2280,6 @@ def get_upload_datasets(property_key, normalized_type, request, user_token, exis
 
     logger.info(f"Executing 'get_upload_datasets()' trigger method on uuid: {existing_data_dict['uuid']}")
 
-    # Get all the user specified fields either top-level or nested from the original query string in request URL
-    # Find the specific sub list, depth is limited to 2
-    # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method
-    # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025
     neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request)
 
     datasets_list = schema_neo4j_queries.get_upload_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude)

From d38d5ae89733da324a07ab02905a02173161fc69 Mon Sep 17 00:00:00 2001
From: yuanzhou <zhy19@pitt.edu>
Date: Thu, 2 Oct 2025 19:58:13 -0400
Subject: [PATCH 3/4] Improved validation

---
 src/app.py                    |  22 +++----
 src/schema/schema_manager.py  | 116 ++++++++++++++++++++++++----------
 src/schema/schema_triggers.py |  38 +++++------
 3 files changed, 111 insertions(+), 65 deletions(-)

diff --git a/src/app.py b/src/app.py
index f1e92f28..dcc4042e 100644
--- a/src/app.py
+++ b/src/app.py
@@ -789,9 +789,9 @@ def get_entity_by_id(id):
     # - top-level properties generated by trigger methods
     # - top-level properties returned as part of Neo4j node properties
     # - second-level properties returned by Neo4j but nested and can't be skipped in Cypher query
-    triggered_top_properties_to_skip = []
-    neo4j_top_properties_to_skip = []
-    neo4j_nested_properties_to_skip = []
+    triggered_top_props_to_skip = []
+    neo4j_top_props_to_skip = []
+    neo4j_nested_props_to_skip = []
 
     if bool(request.args):
         # First make sure the user provided query params are valid
@@ -830,11 +830,11 @@ def get_entity_by_id(id):
         # rather than within it. However, it leverages the existing `exclude_properties_from_response()` 
         # function for simplicity and maintainability. - Zhou 10/1/2025
         try:
-            all_properties_to_exclude = schema_manager.get_all_fields_to_exclude_from_query_string(request)
+            all_props_to_exclude = schema_manager.get_excluded_query_props(request)
             
             # Determine which top-level properties to exclude from triggers and which to exclude directly from the resulting Neo4j `entity_dict`
             # Also get nested properties that are directly returned from Neo4j, which will be handled differently
-            triggered_top_properties_to_skip, neo4j_top_properties_to_skip, neo4j_nested_properties_to_skip = schema_manager.determine_property_exclusion_type(normalized_entity_type, all_properties_to_exclude)
+            triggered_top_props_to_skip, neo4j_top_props_to_skip, neo4j_nested_props_to_skip = schema_manager.get_exclusion_types(normalized_entity_type, all_props_to_exclude)
         except ValueError as e:
             bad_request_error(e)
         except Exception as e:
@@ -842,9 +842,9 @@ def get_entity_by_id(id):
 
     # Get the generated complete entity result from cache if exists
     # Otherwise re-generate on the fly
-    # NOTE: top-level properties in `triggered_top_properties_to_skip` will skip the trigger methods
+    # NOTE: top-level properties in `triggered_top_props_to_skip` will skip the trigger methods
     # Nested properties like `direct_ancestors.files` will be handled by the trigger method - Zhou 10/1/2025
-    complete_dict = schema_manager.get_complete_entity_result(request, token, entity_dict, triggered_top_properties_to_skip)
+    complete_dict = schema_manager.get_complete_entity_result(request, token, entity_dict, triggered_top_props_to_skip)
 
     # Determine if the entity is publicly visible base on its data, only.
     # To verify if a Collection is public, it is necessary to have its Datasets, which
@@ -880,9 +880,9 @@ def get_entity_by_id(id):
                         f"  A Globus token with access permission is required.")
 
     # Remove the top-level properties that are directly available in the resulting Neo4j `entity_dict`
-    # Due to the use of entity cache from `query_target_entity()`, we don't want to exclude the `neo4j_top_properties_to_skip`
+    # Due to the use of entity cache from `query_target_entity()`, we don't want to exclude the `neo4j_top_props_to_skip`
     # from actual Neo4j query. And it's not s performance concern neither. - Zhou 10/1/2025
-    for item in neo4j_top_properties_to_skip:
+    for item in neo4j_top_props_to_skip:
         complete_dict.pop(item)
 
     # Also normalize the result based on schema
@@ -891,9 +891,9 @@ def get_entity_by_id(id):
     # In addition, there may be nested fields like `ingest_metadata.dag_provenance_list` (for Dataset) 
     # where `ingest_metadata` is an actual Neo4j node string property containing `dag_provenance_list`
     # For such cases, we can't handle via simple Neo4j query. Instead, exclude at Python app level.
-    # NOTE: need to convert the `neo4j_nested_properties_to_skip` to a format that can be used by 
+    # NOTE: need to convert the `neo4j_nested_props_to_skip` to a format that can be used by 
     # `exclude_properties_from_response()`  - Zhou 10/1/2025
-    final_result = schema_manager.exclude_properties_from_response(schema_manager.group_dot_notation_properties(neo4j_nested_properties_to_skip), final_result)
+    final_result = schema_manager.exclude_properties_from_response(schema_manager.group_dot_notation_props(neo4j_nested_props_to_skip), final_result)
 
     # Response with the dict
     if public_entity and not user_in_hubmap_read_group(request):
diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py
index 3325c775..17441a24 100644
--- a/src/schema/schema_manager.py
+++ b/src/schema/schema_manager.py
@@ -366,27 +366,28 @@ def delete_nested_field(data, nested_path):
     A flat list of strings containing top-level and/or nested dot-notated properties
     Example: ['a.b', 'a.c', 'x']
 """
-def get_all_fields_to_exclude_from_query_string(request):
-    all_properties_to_exclude = []
+def get_excluded_query_props(request):
+    all_props_to_exclude = []
 
     if 'exclude' in request.args:
         # The query string values are case-sensitive as the property keys in schema yaml are case-sensitive
-        properties_to_exclude_str = request.args.get('exclude')
-        
-        if properties_to_exclude_str:
-            # Must contain only lowercase letters or underscores
-            # This check is faster, very readable, and handles the case efficiently
-            if not (all(c.islower() or c == '_' for c in properties_to_exclude_str) and any(c.isalpha() for c in properties_to_exclude_str)):
-                raise ValueError("Properties specified in 'exclude' query parameter must be either only lowercase letters or lowercase letters with underscores.")
+        props_to_exclude_str = request.args.get('exclude')
 
-            all_properties_to_exclude = [item.strip() for item in properties_to_exclude_str.split(",")]
+        if not validate_comma_separated_exclude_str(props_to_exclude_str):
+            raise ValueError(
+                "Properties specified in 'exclude' query parameter must meet the following rules: "
+                "1) Cannot be empty "
+                "2) The string must contain at least one lowercase letter "
+                "3) May include underscores (_) "
+                "4) Have no more than one dot (.) that cannot start or end the string"
+            )
 
-            logger.info(f"User specified properties to exclude in request URL: {all_properties_to_exclude}")
-        else:
-            raise ValueError("Properties specified in 'exclude' query parameter can not be empty and must be similar to the form of 'a, b, c, d.e, f_g ...' (case-sensitive).")
+        all_props_to_exclude = [item.strip() for item in props_to_exclude_str.split(",")]
+
+        logger.info(f"User specified properties to exclude in request URL: {all_props_to_exclude}")
 
         # A bit more validation to limit to depth 2
-        for item in all_properties_to_exclude:
+        for item in all_props_to_exclude:
             if '.' in item:
                 if len(item.split('.')) > 2:
                     raise ValueError("When dot-notation properties are specified in 'exclude', only single dot-notation allowed (e.g., a.b). Properties with multiple dots like a.b.c are not supported.")
@@ -396,11 +397,56 @@ def get_all_fields_to_exclude_from_query_string(request):
         prohibited_properties = ['uuid', 'entity_type']
         second_level_list = []
 
-        for item in all_properties_to_exclude:
+        for item in all_props_to_exclude:
             if item in prohibited_properties or ('.' in item and item.split('.')[1] in prohibited_properties):
                 raise ValueError(f"Entity property '{item}' is not allowed in the 'exclude' query parameter.")
 
-    return all_properties_to_exclude
+    return all_props_to_exclude
+
+
+"""
+Validate if the value string of 'exclude' query parameter meets the following rules:
+
+- Only contains:
+    - At least one lowercase letter (a–z)
+    - 0 or more underscores (_)
+    - No more than 1 dot (.)
+- Can NOT:
+    - Be an empty string
+    - Start or end with a dot (.)
+    - Have multiple consecutive dots (..)
+
+Parameters
+----------
+s : str
+    Comma-separated input string used to exclude entity properties
+
+Returns
+-------
+bool
+    True if valid or False otherwise
+"""
+def validate_comma_separated_exclude_str(s: str) -> bool:
+    # No empty string
+    if not s:
+        return False
+    
+    # Split by commas
+    items = s.split(',')
+    
+    # No empty items allowed (prevents ',,' or trailing comma)
+    if any(not item.strip() for item in items):
+        return False
+    
+    def is_valid_item(item: str) -> bool:
+        return (
+            all(c.islower() or c in '._' for c in item)  # allowed chars
+            and any(c.isalpha() for c in item)  # at least one letter
+            and item.count('.') <= 1  # max one dot
+            and not (item.startswith('.') or item.endswith('.'))  # no dot at ends
+        )
+    
+    return all(is_valid_item(item.strip()) for item in items)
 
 
 """
@@ -424,7 +470,7 @@ def get_all_fields_to_exclude_from_query_string(request):
 list
     A list mixing strings and grouped dicts, like ['x', {'a': ['b', 'c']}]
 """
-def group_dot_notation_properties(flat_list):
+def group_dot_notation_props(flat_list):
     output_list = []
     grouped_dict = {}
 
@@ -465,16 +511,16 @@ def group_dot_notation_properties(flat_list):
     Three lists - one for triggered properties and one for Neo4j node properties
     
     Example for Dataset:
-    - triggered_top_properties_to_skip: ['direct_ancestors.files', 'direct_ancestors.ingest_metadata', 'upload.title']
-    - neo4j_top_properties_to_skip: ['data_access_level']
-    - neo4j_nested_properties_to_skip: ['status_history.status']
+    - triggered_top_props_to_skip: ['direct_ancestors.files', 'direct_ancestors.ingest_metadata', 'upload.title']
+    - neo4j_top_props_to_skip: ['data_access_level']
+    - neo4j_nested_props_to_skip: ['status_history.status']
 """
-def determine_property_exclusion_type(normalized_entity_type, flat_list):
+def get_exclusion_types(normalized_entity_type, flat_list):
     global _schema
 
-    triggered_top_properties_to_skip = []
-    neo4j_top_properties_to_skip = []
-    neo4j_nested_properties_to_skip =[]
+    triggered_top_props_to_skip = []
+    neo4j_top_props_to_skip = []
+    neo4j_nested_props_to_skip =[]
     top_level_list = []
     second_level_list = []
     properties = _schema['ENTITIES'][normalized_entity_type]['properties']
@@ -490,27 +536,27 @@ def determine_property_exclusion_type(normalized_entity_type, flat_list):
     for item in top_level_list:
         if item in properties:
             if TriggerTypeEnum.ON_READ in properties[item]:
-                triggered_top_properties_to_skip.append(item)
+                triggered_top_props_to_skip.append(item)
             else:
-                neo4j_top_properties_to_skip.append(item)
+                neo4j_top_props_to_skip.append(item)
 
-    # Nested second-level properties, such as `direct_ancestors.files`, belong to `triggered_top_properties_to_skip`
-    # `ingest_metadata.dag_provenance_list` belongs to `neo4j_nested_properties_to_skip`
+    # Nested second-level properties, such as `direct_ancestors.files`, belong to `triggered_top_props_to_skip`
+    # `ingest_metadata.dag_provenance_list` belongs to `neo4j_nested_props_to_skip`
     for item in second_level_list:
         prefix = item.split('.')[0]
         if prefix in properties:
             if TriggerTypeEnum.ON_READ in properties[prefix]:
-                triggered_top_properties_to_skip.append(item)
+                triggered_top_props_to_skip.append(item)
             else:
-                neo4j_nested_properties_to_skip.append(item)
+                neo4j_nested_props_to_skip.append(item)
 
-    logger.info(f"Determined property exclusion type - triggered_top_properties_to_skip: {triggered_top_properties_to_skip}")
-    logger.info(f"Determined property exclusion type - neo4j_top_properties_to_skip: {neo4j_top_properties_to_skip}")
-    logger.info(f"Determined property exclusion type - neo4j_nested_properties_to_skip: {neo4j_nested_properties_to_skip}")
+    logger.info(f"Determined property exclusion type - triggered_top_props_to_skip: {triggered_top_props_to_skip}")
+    logger.info(f"Determined property exclusion type - neo4j_top_props_to_skip: {neo4j_top_props_to_skip}")
+    logger.info(f"Determined property exclusion type - neo4j_nested_props_to_skip: {neo4j_nested_props_to_skip}")
 
-    # NOTE: Will need to convert the `neo4j_nested_properties_to_skip` to a format that can be used by 
+    # NOTE: Will need to convert the `neo4j_nested_props_to_skip` to a format that can be used by 
     # `exclude_properties_from_response()`  - Zhou 10/1/2025
-    return triggered_top_properties_to_skip, neo4j_top_properties_to_skip, neo4j_nested_properties_to_skip
+    return triggered_top_props_to_skip, neo4j_top_props_to_skip, neo4j_nested_props_to_skip
 
 
 """
diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py
index 188ac8ca..91d8a112 100644
--- a/src/schema/schema_triggers.py
+++ b/src/schema/schema_triggers.py
@@ -709,9 +709,9 @@ def get_collection_datasets(property_key, normalized_type, request, user_token,
 
     logger.info(f"Executing 'get_collection_datasets()' trigger method on uuid: {existing_data_dict['uuid']}")
     
-    neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request)
+    neo4j_props_to_exclude = _get_excluded_neo4j_props(property_key, request)
 
-    datasets_list = schema_neo4j_queries.get_collection_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], properties_to_exclude = neo4j_properties_to_exclude)
+    datasets_list = schema_neo4j_queries.get_collection_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], properties_to_exclude = neo4j_props_to_exclude)
 
     # Get rid of the entity node properties that are not defined in the yaml schema
     # as well as the ones defined as `exposed: false` in the yaml schema
@@ -783,9 +783,9 @@ def get_dataset_collections(property_key, normalized_type, request, user_token,
 
     logger.info(f"Executing 'get_dataset_collections()' trigger method on uuid: {existing_data_dict['uuid']}")
 
-    neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request)
+    neo4j_props_to_exclude = _get_excluded_neo4j_props(property_key, request)
 
-    collections_list = schema_neo4j_queries.get_dataset_collections(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude)
+    collections_list = schema_neo4j_queries.get_dataset_collections(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_props_to_exclude)
 
     # Get rid of the entity node properties that are not defined in the yaml schema
     # as well as the ones defined as `exposed: false` in the yaml schema
@@ -863,9 +863,9 @@ def get_dataset_upload(property_key, normalized_type, request, user_token, exist
 
     logger.info(f"Executing 'get_dataset_upload()' trigger method on uuid: {existing_data_dict['uuid']}")
 
-    neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request)
+    neo4j_props_to_exclude = _get_excluded_neo4j_props(property_key, request)
 
-    upload_dict = schema_neo4j_queries.get_dataset_upload(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], properties_to_exclude = neo4j_properties_to_exclude)
+    upload_dict = schema_neo4j_queries.get_dataset_upload(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], properties_to_exclude = neo4j_props_to_exclude)
     
     # Get rid of the entity node properties that are not defined in the yaml schema
     # as well as the ones defined as `exposed: false` in the yaml schema
@@ -993,9 +993,9 @@ def get_dataset_direct_ancestors(property_key, normalized_type, request, user_to
 
     logger.info(f"Executing 'get_dataset_direct_ancestors()' trigger method on uuid: {existing_data_dict['uuid']}")
 
-    neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request)
+    neo4j_props_to_exclude = _get_excluded_neo4j_props(property_key, request)
 
-    direct_ancestors_list = schema_neo4j_queries.get_dataset_direct_ancestors(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude)
+    direct_ancestors_list = schema_neo4j_queries.get_dataset_direct_ancestors(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_props_to_exclude)
 
     # Get rid of the entity node properties that are not defined in the yaml schema
     # as well as the ones defined as `exposed: false` in the yaml schema
@@ -2011,9 +2011,9 @@ def get_sample_direct_ancestor(property_key, normalized_type, request, user_toke
     
     logger.info(f"Executing 'get_sample_direct_ancestor()' trigger method on uuid: {existing_data_dict['uuid']}")
 
-    neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request)
+    neo4j_props_to_exclude = _get_excluded_neo4j_props(property_key, request)
 
-    direct_ancestor_dict = schema_neo4j_queries.get_sample_direct_ancestor(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude)
+    direct_ancestor_dict = schema_neo4j_queries.get_sample_direct_ancestor(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_props_to_exclude)
 
     # Get rid of the entity node properties that are not defined in the yaml schema
     # as well as the ones defined as `exposed: false` in the yaml schema
@@ -2280,9 +2280,9 @@ def get_upload_datasets(property_key, normalized_type, request, user_token, exis
 
     logger.info(f"Executing 'get_upload_datasets()' trigger method on uuid: {existing_data_dict['uuid']}")
 
-    neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request)
+    neo4j_props_to_exclude = _get_excluded_neo4j_props(property_key, request)
 
-    datasets_list = schema_neo4j_queries.get_upload_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude)
+    datasets_list = schema_neo4j_queries.get_upload_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_props_to_exclude)
 
     # Get rid of the entity node properties that are not defined in the yaml schema
     # as well as the ones defined as `exposed: false` in the yaml schema
@@ -2695,12 +2695,12 @@ def _get_age_age_units_race_sex_phrase(age:str=None, age_units:str='units', race
 -------
 list: A list containing Neo4j node properties to exclude
 """
-def _get_neo4j_properties_to_exclude(property_key, request):
-    neo4j_properties_to_exclude = []
+def _get_excluded_neo4j_props(property_key, request):
+    neo4j_props_to_exclude = []
 
     # Get all the user specified fields either top-level or nested from the original query string in request URL
     try:
-        all_properties_to_exclude = schema_manager.get_all_fields_to_exclude_from_query_string(request)
+        all_props_to_exclude = schema_manager.get_excluded_query_props(request)
     except ValueError as e:
         raise ValueError(e)
     except Exception as e:
@@ -2709,7 +2709,7 @@ def _get_neo4j_properties_to_exclude(property_key, request):
     # Find the specific sub list, depth is limited to 2
     # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method
     # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025
-    grouped_fields = schema_manager.group_dot_notation_properties(all_properties_to_exclude)
+    grouped_fields = schema_manager.group_dot_notation_props(all_props_to_exclude)
 
     for item in grouped_fields:
         # Find the depth 2 properties (top-level to this triggered entity)
@@ -2718,13 +2718,13 @@ def _get_neo4j_properties_to_exclude(property_key, request):
                 if not isinstance(field, str):
                     item[property_key].pop(field)
 
-            neo4j_properties_to_exclude = item[property_key]
+            neo4j_props_to_exclude = item[property_key]
             
-            logger.info(f"User specified neo4j properties to exclude in request URL: {neo4j_properties_to_exclude}")
+            logger.info(f"User specified neo4j properties to exclude in request URL: {neo4j_props_to_exclude}")
             
             # Stop after finding the first match
             break
 
-    return neo4j_properties_to_exclude
+    return neo4j_props_to_exclude
 
 

From 40ea183880abe6fd8db751156cc51c6f6edfa200 Mon Sep 17 00:00:00 2001
From: yuanzhou <zhy19@pitt.edu>
Date: Thu, 2 Oct 2025 22:01:45 -0400
Subject: [PATCH 4/4] Improved validation and fix exclude bug in
 exclude_properties_from_response()

---
 src/schema/schema_manager.py | 42 +++++++++++++-----------------------
 1 file changed, 15 insertions(+), 27 deletions(-)

diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py
index 17441a24..07783d5d 100644
--- a/src/schema/schema_manager.py
+++ b/src/schema/schema_manager.py
@@ -331,7 +331,7 @@ def delete_nested_field(data, nested_path):
                                 for item in data[key]:
                                     if nested_field in item:
                                         del item[nested_field]
-                            elif nested_field in data[key]:
+                            elif isinstance(data[key], dict) and nested_field in data[key]:
                                 del data[key][nested_field]
                     elif isinstance(value, dict):
                         delete_nested_field(data[key], value)
@@ -375,23 +375,16 @@ def get_excluded_query_props(request):
 
         if not validate_comma_separated_exclude_str(props_to_exclude_str):
             raise ValueError(
-                "Properties specified in 'exclude' query parameter must meet the following rules: "
-                "1) Cannot be empty "
-                "2) The string must contain at least one lowercase letter "
-                "3) May include underscores (_) "
-                "4) Have no more than one dot (.) that cannot start or end the string"
+                "The 'exclude' query parameter must be a comma-separated list of properties that follow these rules: "
+                "[1] Each property must include at least one letter; "
+                "[2] Only lowercase letters and underscores '_' are allowed; "
+                "[3] Nested property is limited to 2 depths and must use single dot '.' for dot-notation (like 'a.b')."
             )
 
         all_props_to_exclude = [item.strip() for item in props_to_exclude_str.split(",")]
 
         logger.info(f"User specified properties to exclude in request URL: {all_props_to_exclude}")
 
-        # A bit more validation to limit to depth 2
-        for item in all_props_to_exclude:
-            if '.' in item:
-                if len(item.split('.')) > 2:
-                    raise ValueError("When dot-notation properties are specified in 'exclude', only single dot-notation allowed (e.g., a.b). Properties with multiple dots like a.b.c are not supported.")
-        
         # More validation - ensure prohibited properties are not accepted
         # This two properties are required internally by `normalize_entity_result_for_response()`
         prohibited_properties = ['uuid', 'entity_type']
@@ -405,16 +398,11 @@ def get_excluded_query_props(request):
 
 
 """
-Validate if the value string of 'exclude' query parameter meets the following rules:
+The 'exclude' query parameter must be a comma-separated list of properties that follow these rules:
 
-- Only contains:
-    - At least one lowercase letter (a–z)
-    - 0 or more underscores (_)
-    - No more than 1 dot (.)
-- Can NOT:
-    - Be an empty string
-    - Start or end with a dot (.)
-    - Have multiple consecutive dots (..)
+[1] Each property must include at least one letter;
+[2] Only lowercase letters and underscores '_' are allowed;
+[3] Nested property is limited to 2 depths and must use single dot '.' for dot-notation (like 'a.b').
 
 Parameters
 ----------
@@ -426,7 +414,7 @@ def get_excluded_query_props(request):
 bool
     True if valid or False otherwise
 """
-def validate_comma_separated_exclude_str(s: str) -> bool:
+def validate_comma_separated_exclude_str(s: str):
     # No empty string
     if not s:
         return False
@@ -438,12 +426,12 @@ def validate_comma_separated_exclude_str(s: str) -> bool:
     if any(not item.strip() for item in items):
         return False
     
-    def is_valid_item(item: str) -> bool:
+    def is_valid_item(item: str):
         return (
-            all(c.islower() or c in '._' for c in item)  # allowed chars
-            and any(c.isalpha() for c in item)  # at least one letter
-            and item.count('.') <= 1  # max one dot
-            and not (item.startswith('.') or item.endswith('.'))  # no dot at ends
+            all(c.islower() or c in '._' for c in item)
+            and any(c.isalpha() for c in item)
+            and item.count('.') <= 1
+            and not ((item.startswith('.') or item.endswith('.')))
         )
     
     return all(is_valid_item(item.strip()) for item in items)