From 36b6eca48ce3199fa9dbf70d586137e4f10d6439 Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Thu, 2 Oct 2025 17:32:23 -0400 Subject: [PATCH 1/4] Improved validation --- src/app.py | 8 ++++---- src/schema/schema_manager.py | 15 +++++++-------- src/schema/schema_triggers.py | 2 +- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/src/app.py b/src/app.py index 22013627..ead7b399 100644 --- a/src/app.py +++ b/src/app.py @@ -797,7 +797,7 @@ def get_entity_by_id(id): # First make sure the user provided query string params are valid for param in request.args: if param not in supported_qs_params: - bad_request_error(f"Only the following URL query string parameters (case-sensitive) are supported: {COMMA_SEPARATOR.join(supported_qs_params)}") + bad_request_error(f"Only the following URL query parameters (case-sensitive) are supported: {COMMA_SEPARATOR.join(supported_qs_params)}") # Return a single property key and value using ?property= if 'property' in request.args: @@ -811,11 +811,11 @@ def get_entity_by_id(id): # Validate the target property if single_property_key not in supported_property_keys: - bad_request_error(f"Only the following property keys are supported in the query string: {COMMA_SEPARATOR.join(supported_property_keys)}") + bad_request_error(f"Only the following property keys are supported in the query parameter: {COMMA_SEPARATOR.join(supported_property_keys)}") if single_property_key == 'status' and \ not schema_manager.entity_type_instanceof(normalized_entity_type, 'Dataset'): - bad_request_error(f"Only Dataset or Publication supports 'status' property key in the query string") + bad_request_error(f"Only Dataset or Publication supports 'status' property key in the query parameter") # Response with the property value directly # Don't use jsonify() on string value @@ -893,7 +893,7 @@ def get_entity_by_id(id): # For such cases, we can't handle via simple Neo4j query. Instead, exclude at Python app level. # NOTE: need to convert the `neo4j_nested_properties_to_skip` to a format that can be used by # `exclude_properties_from_response()` - Zhou 10/1/2025 - final_result = schema_manager.exclude_properties_from_response(schema_manager.group_dot_notation_fields(neo4j_nested_properties_to_skip), final_result) + final_result = schema_manager.exclude_properties_from_response(schema_manager.group_dot_notation_properties(neo4j_nested_properties_to_skip), final_result) # Response with the dict if public_entity and not user_in_hubmap_read_group(request): diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index 724989cf..9cc4d044 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -374,23 +374,22 @@ def get_all_fields_to_exclude_from_query_string(request): properties_to_exclude_str = request.args.get('exclude') if properties_to_exclude_str: - # Must all lowercase values - has_upper = any(c.isupper() for c in properties_to_exclude_str) - - if has_upper: - raise ValueError("All the properties specified in 'exclude' query string in URL must be lowercase.") + # Must contain only lowercase letters or underscores + # This check is faster, very readable, and handles the case efficiently + if not (all(c.islower() or c == '_' for c in properties_to_exclude_str) and any(c.isalpha() for c in properties_to_exclude_str)): + raise ValueError("Properties specified in 'exclude' query parameter must be either only lowercase letters or lowercase letters with underscores.") all_properties_to_exclude = [item.strip() for item in properties_to_exclude_str.split(",")] logger.info(f"User specified properties to exclude in request URL: {all_properties_to_exclude}") else: - raise ValueError("The value of the 'exclude' query string parameter can not be empty and must be similar to the form of 'a, b, c, d.e, ...' (case-sensitive).") + raise ValueError("The value of the 'exclude' query parameter can not be empty and must be similar to the form of 'a, b, c, d.e, f_g ...' (case-sensitive).") # A bit more validation to limit to depth 2 for item in all_properties_to_exclude: if '.' in item: if len(item.split('.')) > 2: - raise ValueError("Only single dot-separated keys are allowed in 'exclude' (e.g., a.b). Keys with multiple dots like a.b.c are not supported.") + raise ValueError("When dot-notation properties are specified in 'exclude', only single dot-notation allowed (e.g., a.b). Properties with multiple dots like a.b.c are not supported.") # More validation - ensure prohibited properties are not accepted # This two properties are required internally by `normalize_entity_result_for_response()` @@ -425,7 +424,7 @@ def get_all_fields_to_exclude_from_query_string(request): list A list mixing strings and grouped dicts, like ['x', {'a': ['b', 'c']}] """ -def group_dot_notation_fields(flat_list): +def group_dot_notation_properties(flat_list): output_list = [] grouped_dict = {} diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py index 87211de4..9e9de756 100644 --- a/src/schema/schema_triggers.py +++ b/src/schema/schema_triggers.py @@ -2733,7 +2733,7 @@ def _get_neo4j_properties_to_exclude(property_key, request): # Find the specific sub list, depth is limited to 2 # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025 - grouped_fields = schema_manager.group_dot_notation_fields(all_properties_to_exclude) + grouped_fields = schema_manager.group_dot_notation_properties(all_properties_to_exclude) for item in grouped_fields: # Find the depth 2 properties (top-level to this triggered entity) From e6ec05f8f0c32b78179e77f8f3b100263910c4d0 Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Thu, 2 Oct 2025 18:01:37 -0400 Subject: [PATCH 2/4] A bit cleanup --- src/app.py | 8 ++++---- src/schema/schema_manager.py | 2 +- src/schema/schema_triggers.py | 24 ------------------------ 3 files changed, 5 insertions(+), 29 deletions(-) diff --git a/src/app.py b/src/app.py index ead7b399..f1e92f28 100644 --- a/src/app.py +++ b/src/app.py @@ -783,7 +783,7 @@ def get_entity_by_id(id): # 'exclude' is newly added to reduce the large paylod caused by certain fields (`direct_ancestors.files` for instance) # When both 'property' and 'exclude' are specified in the URL, 'property' dominates # since the final result is a single field value - Zhou 10/1/2025 - supported_qs_params = ['property', 'exclude'] + supported_query_params = ['property', 'exclude'] # There are three types of properties that can be excluded from the GET response # - top-level properties generated by trigger methods @@ -794,10 +794,10 @@ def get_entity_by_id(id): neo4j_nested_properties_to_skip = [] if bool(request.args): - # First make sure the user provided query string params are valid + # First make sure the user provided query params are valid for param in request.args: - if param not in supported_qs_params: - bad_request_error(f"Only the following URL query parameters (case-sensitive) are supported: {COMMA_SEPARATOR.join(supported_qs_params)}") + if param not in supported_query_params: + bad_request_error(f"Only the following URL query parameters (case-sensitive) are supported: {COMMA_SEPARATOR.join(supported_query_params)}") # Return a single property key and value using ?property= if 'property' in request.args: diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index 9cc4d044..3325c775 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -383,7 +383,7 @@ def get_all_fields_to_exclude_from_query_string(request): logger.info(f"User specified properties to exclude in request URL: {all_properties_to_exclude}") else: - raise ValueError("The value of the 'exclude' query parameter can not be empty and must be similar to the form of 'a, b, c, d.e, f_g ...' (case-sensitive).") + raise ValueError("Properties specified in 'exclude' query parameter can not be empty and must be similar to the form of 'a, b, c, d.e, f_g ...' (case-sensitive).") # A bit more validation to limit to depth 2 for item in all_properties_to_exclude: diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py index 9e9de756..188ac8ca 100644 --- a/src/schema/schema_triggers.py +++ b/src/schema/schema_triggers.py @@ -709,10 +709,6 @@ def get_collection_datasets(property_key, normalized_type, request, user_token, logger.info(f"Executing 'get_collection_datasets()' trigger method on uuid: {existing_data_dict['uuid']}") - # Get all the user specified fields either top-level or nested from the original query string in request URL - # Find the specific sub list, depth is limited to 2 - # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method - # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025 neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request) datasets_list = schema_neo4j_queries.get_collection_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], properties_to_exclude = neo4j_properties_to_exclude) @@ -787,10 +783,6 @@ def get_dataset_collections(property_key, normalized_type, request, user_token, logger.info(f"Executing 'get_dataset_collections()' trigger method on uuid: {existing_data_dict['uuid']}") - # Get all the user specified fields either top-level or nested from the original query string in request URL - # Find the specific sub list, depth is limited to 2 - # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method - # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025 neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request) collections_list = schema_neo4j_queries.get_dataset_collections(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude) @@ -871,10 +863,6 @@ def get_dataset_upload(property_key, normalized_type, request, user_token, exist logger.info(f"Executing 'get_dataset_upload()' trigger method on uuid: {existing_data_dict['uuid']}") - # Get all the user specified fields either top-level or nested from the original query string in request URL - # Find the specific sub list, depth is limited to 2 - # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method - # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025 neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request) upload_dict = schema_neo4j_queries.get_dataset_upload(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], properties_to_exclude = neo4j_properties_to_exclude) @@ -1005,10 +993,6 @@ def get_dataset_direct_ancestors(property_key, normalized_type, request, user_to logger.info(f"Executing 'get_dataset_direct_ancestors()' trigger method on uuid: {existing_data_dict['uuid']}") - # Get all the user specified fields either top-level or nested from the original query string in request URL - # Find the specific sub list, depth is limited to 2 - # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method - # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025 neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request) direct_ancestors_list = schema_neo4j_queries.get_dataset_direct_ancestors(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude) @@ -2027,10 +2011,6 @@ def get_sample_direct_ancestor(property_key, normalized_type, request, user_toke logger.info(f"Executing 'get_sample_direct_ancestor()' trigger method on uuid: {existing_data_dict['uuid']}") - # Get all the user specified fields either top-level or nested from the original query string in request URL - # Find the specific sub list, depth is limited to 2 - # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method - # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025 neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request) direct_ancestor_dict = schema_neo4j_queries.get_sample_direct_ancestor(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude) @@ -2300,10 +2280,6 @@ def get_upload_datasets(property_key, normalized_type, request, user_token, exis logger.info(f"Executing 'get_upload_datasets()' trigger method on uuid: {existing_data_dict['uuid']}") - # Get all the user specified fields either top-level or nested from the original query string in request URL - # Find the specific sub list, depth is limited to 2 - # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method - # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025 neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request) datasets_list = schema_neo4j_queries.get_upload_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude) From d38d5ae89733da324a07ab02905a02173161fc69 Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Thu, 2 Oct 2025 19:58:13 -0400 Subject: [PATCH 3/4] Improved validation --- src/app.py | 22 +++---- src/schema/schema_manager.py | 116 ++++++++++++++++++++++++---------- src/schema/schema_triggers.py | 38 +++++------ 3 files changed, 111 insertions(+), 65 deletions(-) diff --git a/src/app.py b/src/app.py index f1e92f28..dcc4042e 100644 --- a/src/app.py +++ b/src/app.py @@ -789,9 +789,9 @@ def get_entity_by_id(id): # - top-level properties generated by trigger methods # - top-level properties returned as part of Neo4j node properties # - second-level properties returned by Neo4j but nested and can't be skipped in Cypher query - triggered_top_properties_to_skip = [] - neo4j_top_properties_to_skip = [] - neo4j_nested_properties_to_skip = [] + triggered_top_props_to_skip = [] + neo4j_top_props_to_skip = [] + neo4j_nested_props_to_skip = [] if bool(request.args): # First make sure the user provided query params are valid @@ -830,11 +830,11 @@ def get_entity_by_id(id): # rather than within it. However, it leverages the existing `exclude_properties_from_response()` # function for simplicity and maintainability. - Zhou 10/1/2025 try: - all_properties_to_exclude = schema_manager.get_all_fields_to_exclude_from_query_string(request) + all_props_to_exclude = schema_manager.get_excluded_query_props(request) # Determine which top-level properties to exclude from triggers and which to exclude directly from the resulting Neo4j `entity_dict` # Also get nested properties that are directly returned from Neo4j, which will be handled differently - triggered_top_properties_to_skip, neo4j_top_properties_to_skip, neo4j_nested_properties_to_skip = schema_manager.determine_property_exclusion_type(normalized_entity_type, all_properties_to_exclude) + triggered_top_props_to_skip, neo4j_top_props_to_skip, neo4j_nested_props_to_skip = schema_manager.get_exclusion_types(normalized_entity_type, all_props_to_exclude) except ValueError as e: bad_request_error(e) except Exception as e: @@ -842,9 +842,9 @@ def get_entity_by_id(id): # Get the generated complete entity result from cache if exists # Otherwise re-generate on the fly - # NOTE: top-level properties in `triggered_top_properties_to_skip` will skip the trigger methods + # NOTE: top-level properties in `triggered_top_props_to_skip` will skip the trigger methods # Nested properties like `direct_ancestors.files` will be handled by the trigger method - Zhou 10/1/2025 - complete_dict = schema_manager.get_complete_entity_result(request, token, entity_dict, triggered_top_properties_to_skip) + complete_dict = schema_manager.get_complete_entity_result(request, token, entity_dict, triggered_top_props_to_skip) # Determine if the entity is publicly visible base on its data, only. # To verify if a Collection is public, it is necessary to have its Datasets, which @@ -880,9 +880,9 @@ def get_entity_by_id(id): f" A Globus token with access permission is required.") # Remove the top-level properties that are directly available in the resulting Neo4j `entity_dict` - # Due to the use of entity cache from `query_target_entity()`, we don't want to exclude the `neo4j_top_properties_to_skip` + # Due to the use of entity cache from `query_target_entity()`, we don't want to exclude the `neo4j_top_props_to_skip` # from actual Neo4j query. And it's not s performance concern neither. - Zhou 10/1/2025 - for item in neo4j_top_properties_to_skip: + for item in neo4j_top_props_to_skip: complete_dict.pop(item) # Also normalize the result based on schema @@ -891,9 +891,9 @@ def get_entity_by_id(id): # In addition, there may be nested fields like `ingest_metadata.dag_provenance_list` (for Dataset) # where `ingest_metadata` is an actual Neo4j node string property containing `dag_provenance_list` # For such cases, we can't handle via simple Neo4j query. Instead, exclude at Python app level. - # NOTE: need to convert the `neo4j_nested_properties_to_skip` to a format that can be used by + # NOTE: need to convert the `neo4j_nested_props_to_skip` to a format that can be used by # `exclude_properties_from_response()` - Zhou 10/1/2025 - final_result = schema_manager.exclude_properties_from_response(schema_manager.group_dot_notation_properties(neo4j_nested_properties_to_skip), final_result) + final_result = schema_manager.exclude_properties_from_response(schema_manager.group_dot_notation_props(neo4j_nested_props_to_skip), final_result) # Response with the dict if public_entity and not user_in_hubmap_read_group(request): diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index 3325c775..17441a24 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -366,27 +366,28 @@ def delete_nested_field(data, nested_path): A flat list of strings containing top-level and/or nested dot-notated properties Example: ['a.b', 'a.c', 'x'] """ -def get_all_fields_to_exclude_from_query_string(request): - all_properties_to_exclude = [] +def get_excluded_query_props(request): + all_props_to_exclude = [] if 'exclude' in request.args: # The query string values are case-sensitive as the property keys in schema yaml are case-sensitive - properties_to_exclude_str = request.args.get('exclude') - - if properties_to_exclude_str: - # Must contain only lowercase letters or underscores - # This check is faster, very readable, and handles the case efficiently - if not (all(c.islower() or c == '_' for c in properties_to_exclude_str) and any(c.isalpha() for c in properties_to_exclude_str)): - raise ValueError("Properties specified in 'exclude' query parameter must be either only lowercase letters or lowercase letters with underscores.") + props_to_exclude_str = request.args.get('exclude') - all_properties_to_exclude = [item.strip() for item in properties_to_exclude_str.split(",")] + if not validate_comma_separated_exclude_str(props_to_exclude_str): + raise ValueError( + "Properties specified in 'exclude' query parameter must meet the following rules: " + "1) Cannot be empty " + "2) The string must contain at least one lowercase letter " + "3) May include underscores (_) " + "4) Have no more than one dot (.) that cannot start or end the string" + ) - logger.info(f"User specified properties to exclude in request URL: {all_properties_to_exclude}") - else: - raise ValueError("Properties specified in 'exclude' query parameter can not be empty and must be similar to the form of 'a, b, c, d.e, f_g ...' (case-sensitive).") + all_props_to_exclude = [item.strip() for item in props_to_exclude_str.split(",")] + + logger.info(f"User specified properties to exclude in request URL: {all_props_to_exclude}") # A bit more validation to limit to depth 2 - for item in all_properties_to_exclude: + for item in all_props_to_exclude: if '.' in item: if len(item.split('.')) > 2: raise ValueError("When dot-notation properties are specified in 'exclude', only single dot-notation allowed (e.g., a.b). Properties with multiple dots like a.b.c are not supported.") @@ -396,11 +397,56 @@ def get_all_fields_to_exclude_from_query_string(request): prohibited_properties = ['uuid', 'entity_type'] second_level_list = [] - for item in all_properties_to_exclude: + for item in all_props_to_exclude: if item in prohibited_properties or ('.' in item and item.split('.')[1] in prohibited_properties): raise ValueError(f"Entity property '{item}' is not allowed in the 'exclude' query parameter.") - return all_properties_to_exclude + return all_props_to_exclude + + +""" +Validate if the value string of 'exclude' query parameter meets the following rules: + +- Only contains: + - At least one lowercase letter (a–z) + - 0 or more underscores (_) + - No more than 1 dot (.) +- Can NOT: + - Be an empty string + - Start or end with a dot (.) + - Have multiple consecutive dots (..) + +Parameters +---------- +s : str + Comma-separated input string used to exclude entity properties + +Returns +------- +bool + True if valid or False otherwise +""" +def validate_comma_separated_exclude_str(s: str) -> bool: + # No empty string + if not s: + return False + + # Split by commas + items = s.split(',') + + # No empty items allowed (prevents ',,' or trailing comma) + if any(not item.strip() for item in items): + return False + + def is_valid_item(item: str) -> bool: + return ( + all(c.islower() or c in '._' for c in item) # allowed chars + and any(c.isalpha() for c in item) # at least one letter + and item.count('.') <= 1 # max one dot + and not (item.startswith('.') or item.endswith('.')) # no dot at ends + ) + + return all(is_valid_item(item.strip()) for item in items) """ @@ -424,7 +470,7 @@ def get_all_fields_to_exclude_from_query_string(request): list A list mixing strings and grouped dicts, like ['x', {'a': ['b', 'c']}] """ -def group_dot_notation_properties(flat_list): +def group_dot_notation_props(flat_list): output_list = [] grouped_dict = {} @@ -465,16 +511,16 @@ def group_dot_notation_properties(flat_list): Three lists - one for triggered properties and one for Neo4j node properties Example for Dataset: - - triggered_top_properties_to_skip: ['direct_ancestors.files', 'direct_ancestors.ingest_metadata', 'upload.title'] - - neo4j_top_properties_to_skip: ['data_access_level'] - - neo4j_nested_properties_to_skip: ['status_history.status'] + - triggered_top_props_to_skip: ['direct_ancestors.files', 'direct_ancestors.ingest_metadata', 'upload.title'] + - neo4j_top_props_to_skip: ['data_access_level'] + - neo4j_nested_props_to_skip: ['status_history.status'] """ -def determine_property_exclusion_type(normalized_entity_type, flat_list): +def get_exclusion_types(normalized_entity_type, flat_list): global _schema - triggered_top_properties_to_skip = [] - neo4j_top_properties_to_skip = [] - neo4j_nested_properties_to_skip =[] + triggered_top_props_to_skip = [] + neo4j_top_props_to_skip = [] + neo4j_nested_props_to_skip =[] top_level_list = [] second_level_list = [] properties = _schema['ENTITIES'][normalized_entity_type]['properties'] @@ -490,27 +536,27 @@ def determine_property_exclusion_type(normalized_entity_type, flat_list): for item in top_level_list: if item in properties: if TriggerTypeEnum.ON_READ in properties[item]: - triggered_top_properties_to_skip.append(item) + triggered_top_props_to_skip.append(item) else: - neo4j_top_properties_to_skip.append(item) + neo4j_top_props_to_skip.append(item) - # Nested second-level properties, such as `direct_ancestors.files`, belong to `triggered_top_properties_to_skip` - # `ingest_metadata.dag_provenance_list` belongs to `neo4j_nested_properties_to_skip` + # Nested second-level properties, such as `direct_ancestors.files`, belong to `triggered_top_props_to_skip` + # `ingest_metadata.dag_provenance_list` belongs to `neo4j_nested_props_to_skip` for item in second_level_list: prefix = item.split('.')[0] if prefix in properties: if TriggerTypeEnum.ON_READ in properties[prefix]: - triggered_top_properties_to_skip.append(item) + triggered_top_props_to_skip.append(item) else: - neo4j_nested_properties_to_skip.append(item) + neo4j_nested_props_to_skip.append(item) - logger.info(f"Determined property exclusion type - triggered_top_properties_to_skip: {triggered_top_properties_to_skip}") - logger.info(f"Determined property exclusion type - neo4j_top_properties_to_skip: {neo4j_top_properties_to_skip}") - logger.info(f"Determined property exclusion type - neo4j_nested_properties_to_skip: {neo4j_nested_properties_to_skip}") + logger.info(f"Determined property exclusion type - triggered_top_props_to_skip: {triggered_top_props_to_skip}") + logger.info(f"Determined property exclusion type - neo4j_top_props_to_skip: {neo4j_top_props_to_skip}") + logger.info(f"Determined property exclusion type - neo4j_nested_props_to_skip: {neo4j_nested_props_to_skip}") - # NOTE: Will need to convert the `neo4j_nested_properties_to_skip` to a format that can be used by + # NOTE: Will need to convert the `neo4j_nested_props_to_skip` to a format that can be used by # `exclude_properties_from_response()` - Zhou 10/1/2025 - return triggered_top_properties_to_skip, neo4j_top_properties_to_skip, neo4j_nested_properties_to_skip + return triggered_top_props_to_skip, neo4j_top_props_to_skip, neo4j_nested_props_to_skip """ diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py index 188ac8ca..91d8a112 100644 --- a/src/schema/schema_triggers.py +++ b/src/schema/schema_triggers.py @@ -709,9 +709,9 @@ def get_collection_datasets(property_key, normalized_type, request, user_token, logger.info(f"Executing 'get_collection_datasets()' trigger method on uuid: {existing_data_dict['uuid']}") - neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request) + neo4j_props_to_exclude = _get_excluded_neo4j_props(property_key, request) - datasets_list = schema_neo4j_queries.get_collection_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], properties_to_exclude = neo4j_properties_to_exclude) + datasets_list = schema_neo4j_queries.get_collection_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], properties_to_exclude = neo4j_props_to_exclude) # Get rid of the entity node properties that are not defined in the yaml schema # as well as the ones defined as `exposed: false` in the yaml schema @@ -783,9 +783,9 @@ def get_dataset_collections(property_key, normalized_type, request, user_token, logger.info(f"Executing 'get_dataset_collections()' trigger method on uuid: {existing_data_dict['uuid']}") - neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request) + neo4j_props_to_exclude = _get_excluded_neo4j_props(property_key, request) - collections_list = schema_neo4j_queries.get_dataset_collections(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude) + collections_list = schema_neo4j_queries.get_dataset_collections(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_props_to_exclude) # Get rid of the entity node properties that are not defined in the yaml schema # as well as the ones defined as `exposed: false` in the yaml schema @@ -863,9 +863,9 @@ def get_dataset_upload(property_key, normalized_type, request, user_token, exist logger.info(f"Executing 'get_dataset_upload()' trigger method on uuid: {existing_data_dict['uuid']}") - neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request) + neo4j_props_to_exclude = _get_excluded_neo4j_props(property_key, request) - upload_dict = schema_neo4j_queries.get_dataset_upload(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], properties_to_exclude = neo4j_properties_to_exclude) + upload_dict = schema_neo4j_queries.get_dataset_upload(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], properties_to_exclude = neo4j_props_to_exclude) # Get rid of the entity node properties that are not defined in the yaml schema # as well as the ones defined as `exposed: false` in the yaml schema @@ -993,9 +993,9 @@ def get_dataset_direct_ancestors(property_key, normalized_type, request, user_to logger.info(f"Executing 'get_dataset_direct_ancestors()' trigger method on uuid: {existing_data_dict['uuid']}") - neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request) + neo4j_props_to_exclude = _get_excluded_neo4j_props(property_key, request) - direct_ancestors_list = schema_neo4j_queries.get_dataset_direct_ancestors(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude) + direct_ancestors_list = schema_neo4j_queries.get_dataset_direct_ancestors(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_props_to_exclude) # Get rid of the entity node properties that are not defined in the yaml schema # as well as the ones defined as `exposed: false` in the yaml schema @@ -2011,9 +2011,9 @@ def get_sample_direct_ancestor(property_key, normalized_type, request, user_toke logger.info(f"Executing 'get_sample_direct_ancestor()' trigger method on uuid: {existing_data_dict['uuid']}") - neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request) + neo4j_props_to_exclude = _get_excluded_neo4j_props(property_key, request) - direct_ancestor_dict = schema_neo4j_queries.get_sample_direct_ancestor(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude) + direct_ancestor_dict = schema_neo4j_queries.get_sample_direct_ancestor(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_props_to_exclude) # Get rid of the entity node properties that are not defined in the yaml schema # as well as the ones defined as `exposed: false` in the yaml schema @@ -2280,9 +2280,9 @@ def get_upload_datasets(property_key, normalized_type, request, user_token, exis logger.info(f"Executing 'get_upload_datasets()' trigger method on uuid: {existing_data_dict['uuid']}") - neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request) + neo4j_props_to_exclude = _get_excluded_neo4j_props(property_key, request) - datasets_list = schema_neo4j_queries.get_upload_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude) + datasets_list = schema_neo4j_queries.get_upload_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_props_to_exclude) # Get rid of the entity node properties that are not defined in the yaml schema # as well as the ones defined as `exposed: false` in the yaml schema @@ -2695,12 +2695,12 @@ def _get_age_age_units_race_sex_phrase(age:str=None, age_units:str='units', race ------- list: A list containing Neo4j node properties to exclude """ -def _get_neo4j_properties_to_exclude(property_key, request): - neo4j_properties_to_exclude = [] +def _get_excluded_neo4j_props(property_key, request): + neo4j_props_to_exclude = [] # Get all the user specified fields either top-level or nested from the original query string in request URL try: - all_properties_to_exclude = schema_manager.get_all_fields_to_exclude_from_query_string(request) + all_props_to_exclude = schema_manager.get_excluded_query_props(request) except ValueError as e: raise ValueError(e) except Exception as e: @@ -2709,7 +2709,7 @@ def _get_neo4j_properties_to_exclude(property_key, request): # Find the specific sub list, depth is limited to 2 # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025 - grouped_fields = schema_manager.group_dot_notation_properties(all_properties_to_exclude) + grouped_fields = schema_manager.group_dot_notation_props(all_props_to_exclude) for item in grouped_fields: # Find the depth 2 properties (top-level to this triggered entity) @@ -2718,13 +2718,13 @@ def _get_neo4j_properties_to_exclude(property_key, request): if not isinstance(field, str): item[property_key].pop(field) - neo4j_properties_to_exclude = item[property_key] + neo4j_props_to_exclude = item[property_key] - logger.info(f"User specified neo4j properties to exclude in request URL: {neo4j_properties_to_exclude}") + logger.info(f"User specified neo4j properties to exclude in request URL: {neo4j_props_to_exclude}") # Stop after finding the first match break - return neo4j_properties_to_exclude + return neo4j_props_to_exclude From 40ea183880abe6fd8db751156cc51c6f6edfa200 Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Thu, 2 Oct 2025 22:01:45 -0400 Subject: [PATCH 4/4] Improved validation and fix exclude bug in exclude_properties_from_response() --- src/schema/schema_manager.py | 42 +++++++++++++----------------------- 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index 17441a24..07783d5d 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -331,7 +331,7 @@ def delete_nested_field(data, nested_path): for item in data[key]: if nested_field in item: del item[nested_field] - elif nested_field in data[key]: + elif isinstance(data[key], dict) and nested_field in data[key]: del data[key][nested_field] elif isinstance(value, dict): delete_nested_field(data[key], value) @@ -375,23 +375,16 @@ def get_excluded_query_props(request): if not validate_comma_separated_exclude_str(props_to_exclude_str): raise ValueError( - "Properties specified in 'exclude' query parameter must meet the following rules: " - "1) Cannot be empty " - "2) The string must contain at least one lowercase letter " - "3) May include underscores (_) " - "4) Have no more than one dot (.) that cannot start or end the string" + "The 'exclude' query parameter must be a comma-separated list of properties that follow these rules: " + "[1] Each property must include at least one letter; " + "[2] Only lowercase letters and underscores '_' are allowed; " + "[3] Nested property is limited to 2 depths and must use single dot '.' for dot-notation (like 'a.b')." ) all_props_to_exclude = [item.strip() for item in props_to_exclude_str.split(",")] logger.info(f"User specified properties to exclude in request URL: {all_props_to_exclude}") - # A bit more validation to limit to depth 2 - for item in all_props_to_exclude: - if '.' in item: - if len(item.split('.')) > 2: - raise ValueError("When dot-notation properties are specified in 'exclude', only single dot-notation allowed (e.g., a.b). Properties with multiple dots like a.b.c are not supported.") - # More validation - ensure prohibited properties are not accepted # This two properties are required internally by `normalize_entity_result_for_response()` prohibited_properties = ['uuid', 'entity_type'] @@ -405,16 +398,11 @@ def get_excluded_query_props(request): """ -Validate if the value string of 'exclude' query parameter meets the following rules: +The 'exclude' query parameter must be a comma-separated list of properties that follow these rules: -- Only contains: - - At least one lowercase letter (a–z) - - 0 or more underscores (_) - - No more than 1 dot (.) -- Can NOT: - - Be an empty string - - Start or end with a dot (.) - - Have multiple consecutive dots (..) +[1] Each property must include at least one letter; +[2] Only lowercase letters and underscores '_' are allowed; +[3] Nested property is limited to 2 depths and must use single dot '.' for dot-notation (like 'a.b'). Parameters ---------- @@ -426,7 +414,7 @@ def get_excluded_query_props(request): bool True if valid or False otherwise """ -def validate_comma_separated_exclude_str(s: str) -> bool: +def validate_comma_separated_exclude_str(s: str): # No empty string if not s: return False @@ -438,12 +426,12 @@ def validate_comma_separated_exclude_str(s: str) -> bool: if any(not item.strip() for item in items): return False - def is_valid_item(item: str) -> bool: + def is_valid_item(item: str): return ( - all(c.islower() or c in '._' for c in item) # allowed chars - and any(c.isalpha() for c in item) # at least one letter - and item.count('.') <= 1 # max one dot - and not (item.startswith('.') or item.endswith('.')) # no dot at ends + all(c.islower() or c in '._' for c in item) + and any(c.isalpha() for c in item) + and item.count('.') <= 1 + and not ((item.startswith('.') or item.endswith('.'))) ) return all(is_valid_item(item.strip()) for item in items)