From e822ecf58b13fd225486d6ba0a97be6e54e211ec Mon Sep 17 00:00:00 2001 From: April Kwong Date: Sat, 7 Mar 2026 23:29:21 -0800 Subject: [PATCH 1/6] Fix AOAI evaluation to preserve list values instead of stringifying them The _convert_value helper in _get_data_source was converting list values to strings via str(), turning [] into '[]'. The AOAI API then rejected these with 'is not of type array' errors. Move list from the stringify branch to the pass-through branch alongside dict, since both are structured JSON types that should be preserved as native objects for proper serialization. Update existing test assertions and add a new test for list/dict value preservation including empty collections. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../ai/evaluation/_evaluate/_evaluate_aoai.py | 4 +-- .../tests/unittests/test_aoai_data_source.py | 35 +++++++++++++++++-- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py index a548fc529ab4..15fe34079ac5 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py @@ -816,9 +816,9 @@ def _convert_value(val: Any) -> Any: if isinstance(val, bool): return val # Align numerics with legacy text-only JSONL payloads by turning them into strings. - if isinstance(val, (int, float, list)): + if isinstance(val, (int, float)): return str(val) - if isinstance(val, (dict)): + if isinstance(val, (list, dict)): return val return str(val) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py index 6d77e098eaba..186d651a2328 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py @@ -437,7 +437,7 @@ def test_data_source_with_item_column_and_nested_values(self, nested_item_keywor # Ensure we did not accidentally nest another 'item' key inside the wrapper assert "item" not in item_payload assert item_payload["sample"]["output_text"] == "someoutput" - assert item_payload["sample"]["output_items"] == "['item1', 'item2']" + assert item_payload["sample"]["output_items"] == ["item1", "item2"] def test_data_source_with_item_sample_column_and_nested_values(self, nested_item_sample_keyword_data): """Ensure rows that already have an 'item' column keep nested dicts intact.""" @@ -464,7 +464,7 @@ def test_data_source_with_item_sample_column_and_nested_values(self, nested_item # Ensure we did not accidentally nest another 'item' key inside the wrapper assert "item" not in item_payload assert item_payload["sample"]["output_text"] == "someoutput" - assert item_payload["sample"]["output_items"] == "['item1', 'item2']" + assert item_payload["sample"]["output_items"] == ["item1", "item2"] def test_data_source_with_sample_output_metadata(self, flat_sample_output_data): """Ensure flat rows that include dotted sample metadata remain accessible.""" @@ -485,7 +485,7 @@ def test_data_source_with_sample_output_metadata(self, flat_sample_output_data): assert row["test"]["test_string"] == "baking cakes is fun!" # sample.output_text should follow the row through normalization without being stringified assert row["sample.output_text"] == "someoutput" - assert row["sample.output_items"] == "['item1', 'item2']" + assert row["sample.output_items"] == ["item1", "item2"] def test_data_source_with_numeric_values(self, flat_test_data): """Test data source generation converts numeric values to strings.""" @@ -504,6 +504,35 @@ def test_data_source_with_numeric_values(self, flat_test_data): assert isinstance(content[0][WRAPPER_KEY]["score"], str) assert isinstance(content[0][WRAPPER_KEY]["confidence"], str) + def test_data_source_with_list_and_dict_values(self, flat_test_data): + """Test data source generation preserves list and dict values as-is.""" + flat_test_data["tags"] = [["tag1", "tag2"], ["tag3"], []] + flat_test_data["metadata"] = [{"key": "val"}, {"key2": "val2"}, {}] + + column_mapping = { + "query": "${data.query}", + "tags": "${data.tags}", + "metadata": "${data.metadata}", + } + + data_source = _get_data_source(flat_test_data, column_mapping) + + content = data_source["source"]["content"] + + # Lists should be preserved as lists, not stringified + assert content[0][WRAPPER_KEY]["tags"] == ["tag1", "tag2"] + assert isinstance(content[0][WRAPPER_KEY]["tags"], list) + # Empty lists should also be preserved + assert content[2][WRAPPER_KEY]["tags"] == [] + assert isinstance(content[2][WRAPPER_KEY]["tags"], list) + + # Dicts should be preserved as dicts + assert content[0][WRAPPER_KEY]["metadata"] == {"key": "val"} + assert isinstance(content[0][WRAPPER_KEY]["metadata"], dict) + # Empty dicts should also be preserved + assert content[2][WRAPPER_KEY]["metadata"] == {} + assert isinstance(content[2][WRAPPER_KEY]["metadata"], dict) + def test_empty_dataframe(self): """Test data source generation with empty dataframe.""" empty_df = pd.DataFrame() From 81aded89f0e664fc5c210d3707db1cac8c6dccae Mon Sep 17 00:00:00 2001 From: April Kwong Date: Sat, 7 Mar 2026 23:40:31 -0800 Subject: [PATCH 2/6] Infer array/object schema types for list/dict columns in flat mode The flat schema generator in _generate_data_source_config now samples the first row to emit the correct JSON Schema type (array, object, or string) instead of defaulting everything to string. This ensures the schema aligns with the data produced by _convert_value. Add test for schema type inference and an integration test verifying schema-data alignment for list/dict columns including empty collections. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../ai/evaluation/_evaluate/_evaluate_aoai.py | 10 +++- .../tests/unittests/test_aoai_data_source.py | 55 +++++++++++++++++++ 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py index 15fe34079ac5..f65b0969027f 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py @@ -715,8 +715,14 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di props = data_source_config["item_schema"]["properties"] req = data_source_config["item_schema"]["required"] for key in column_mapping.keys(): - if key in input_data_df and len(input_data_df[key]) > 0 and isinstance(input_data_df[key].iloc[0], list): - props[key] = {"type": "array"} + if key in input_data_df and len(input_data_df[key]) > 0: + sample = input_data_df[key].iloc[0] + if isinstance(sample, list): + props[key] = {"type": "array"} + elif isinstance(sample, dict): + props[key] = {"type": "object"} + else: + props[key] = {"type": "string"} else: props[key] = {"type": "string"} req.append(key) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py index 186d651a2328..b7baf9ce893c 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py @@ -297,6 +297,31 @@ def test_single_nested_path(self, flat_test_data): # After wrapper stripping, should see context assert "context" in schema["properties"] + def test_flat_schema_infers_list_and_dict_types(self, flat_test_data): + """Test that flat schema correctly infers array/object types from data.""" + flat_test_data["tags"] = [["tag1", "tag2"], ["tag3"], []] + flat_test_data["metadata"] = [{"key": "val"}, {"key2": "val2"}, {}] + flat_test_data["score"] = [95, 87, 92] + + column_mapping = { + "query": "${data.query}", + "tags": "${data.tags}", + "metadata": "${data.metadata}", + "score": "${data.score}", + } + + config = _generate_data_source_config(flat_test_data, column_mapping) + + properties = config["item_schema"]["properties"] + # Strings should be typed as string + assert properties["query"]["type"] == "string" + # Lists should be typed as array + assert properties["tags"]["type"] == "array" + # Dicts should be typed as object + assert properties["metadata"]["type"] == "object" + # Numerics (converted to str by _convert_value) should be typed as string + assert properties["score"]["type"] == "string" + @pytest.mark.unittest class TestGetDataSource: @@ -629,3 +654,33 @@ def test_nested_schema_and_data_alignment(self, nested_test_data): assert "query" in item assert "context" in item assert "company" in item["context"] + + def test_flat_schema_and_data_alignment_with_list_and_dict(self, flat_test_data): + """Test that schema types and data values agree for list/dict columns.""" + flat_test_data["tags"] = [["tag1", "tag2"], ["tag3"], []] + flat_test_data["metadata"] = [{"key": "val"}, {"key2": "val2"}, {}] + + column_mapping = { + "query": "${data.query}", + "tags": "${data.tags}", + "metadata": "${data.metadata}", + } + + config = _generate_data_source_config(flat_test_data, column_mapping) + data_source = _get_data_source(flat_test_data, column_mapping) + + schema_props = config["item_schema"]["properties"] + data_item = data_source["source"]["content"][0][WRAPPER_KEY] + + # Schema declares array → data contains a list + assert schema_props["tags"]["type"] == "array" + assert isinstance(data_item["tags"], list) + + # Schema declares object → data contains a dict + assert schema_props["metadata"]["type"] == "object" + assert isinstance(data_item["metadata"], dict) + + # Empty collections should also align + empty_item = data_source["source"]["content"][2][WRAPPER_KEY] + assert isinstance(empty_item["tags"], list) + assert isinstance(empty_item["metadata"], dict) From bc0c675813734f67d0dddfb409eea875b98f45f2 Mon Sep 17 00:00:00 2001 From: April Kwong Date: Sun, 8 Mar 2026 01:02:22 -0800 Subject: [PATCH 3/6] Fix pass_threshold propagation and zero-threshold logging - Use 'is not None' instead of truthiness check in _build_internal_log_attributes so threshold=0 is not silently dropped. - Propagate _pass_threshold from evaluator_config into testing_criteria_metadata in _extract_testing_criteria_metadata. - Inject pass_threshold into metric results in _process_criteria_metrics when the evaluator (e.g. PythonGrader) does not emit one, without overwriting evaluator-provided thresholds. - Add 12 unit tests covering all three changes including zero-value edge cases. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../ai/evaluation/_evaluate/_evaluate.py | 15 +- .../tests/unittests/test_evaluate.py | 157 ++++++++++++++++++ 2 files changed, 171 insertions(+), 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py index b596f4b8430c..44273f687bef 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py @@ -1103,7 +1103,7 @@ def _build_internal_log_attributes( # Create a copy of the base log attributes internal_log_attributes: Dict[str, str] = log_attributes.copy() # Add threshold if present - if event_data.get("threshold"): + if event_data.get("threshold") is not None: internal_log_attributes["gen_ai.evaluation.threshold"] = str(event_data["threshold"]) # Add testing criteria details if present @@ -2030,6 +2030,11 @@ def _extract_testing_criteria_metadata( "metrics": metrics, "is_inverse": is_inverse, } + # Propagate pass_threshold from evaluator config so result events can include it + if evaluator_config and criteria_name in evaluator_config: + pass_threshold = evaluator_config[criteria_name].get("_pass_threshold") + if pass_threshold is not None: + testing_criteria_metadata[criteria_name]["pass_threshold"] = pass_threshold return testing_criteria_metadata @@ -2503,6 +2508,14 @@ def _process_criteria_metrics( # Extract metric values result_per_metric = _extract_metric_values(criteria_name, criteria_type, metrics, expected_metrics, logger) + # Inject threshold from evaluator config when not present in raw results + # (e.g., PythonGrader/code evaluators don't emit a threshold column) + config_threshold = testing_criteria_metadata.get(criteria_name, {}).get("pass_threshold") + if config_threshold is not None: + for metric_values in result_per_metric.values(): + if metric_values.get("threshold") is None: + metric_values["threshold"] = config_threshold + # Convert to result objects results = [] top_sample = {} diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py index 3a5627abc604..4b6654cbeb9c 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py @@ -41,6 +41,9 @@ _process_rows, _aggregate_label_defect_metrics, _update_metric_value, + _build_internal_log_attributes, + _extract_testing_criteria_metadata, + _process_criteria_metrics, ) from azure.ai.evaluation._evaluate._utils import _convert_name_map_into_property_entries from azure.ai.evaluation._evaluate._utils import _apply_column_mapping, _trace_destination_from_project_scope @@ -1822,3 +1825,157 @@ def test_nan_string_maps_to_none(self, suffix): result = self._call(f"evaluator{suffix}", "nan") token_key = suffix.lstrip("_") assert result["sample"]["usage"][token_key] is None + + +@pytest.mark.unittest +class TestBuildInternalLogAttributesThreshold: + """Tests for _build_internal_log_attributes threshold handling.""" + + def test_threshold_zero_is_included(self): + """Threshold of 0 should be logged, not silently dropped.""" + event_data = {"threshold": 0, "name": "my_grader"} + attrs = _build_internal_log_attributes(event_data, "score", None, {}) + assert "gen_ai.evaluation.threshold" in attrs + assert attrs["gen_ai.evaluation.threshold"] == "0" + + def test_threshold_zero_float_is_included(self): + """Threshold of 0.0 should be logged.""" + event_data = {"threshold": 0.0, "name": "my_grader"} + attrs = _build_internal_log_attributes(event_data, "score", None, {}) + assert attrs["gen_ai.evaluation.threshold"] == "0.0" + + def test_threshold_none_is_excluded(self): + """No threshold key when threshold is None.""" + event_data = {"threshold": None, "name": "my_grader"} + attrs = _build_internal_log_attributes(event_data, "score", None, {}) + assert "gen_ai.evaluation.threshold" not in attrs + + def test_threshold_missing_is_excluded(self): + """No threshold key when threshold is absent from event_data.""" + event_data = {"name": "my_grader"} + attrs = _build_internal_log_attributes(event_data, "score", None, {}) + assert "gen_ai.evaluation.threshold" not in attrs + + def test_threshold_positive_value(self): + """Normal positive threshold is included.""" + event_data = {"threshold": 3.5, "name": "my_grader"} + attrs = _build_internal_log_attributes(event_data, "score", None, {}) + assert attrs["gen_ai.evaluation.threshold"] == "3.5" + + +@pytest.mark.unittest +class TestExtractTestingCriteriaMetadataPassThreshold: + """Tests for pass_threshold propagation in _extract_testing_criteria_metadata.""" + + def test_pass_threshold_propagated_from_config(self): + """pass_threshold from evaluator_config should appear in metadata.""" + evaluators = {"my_grader": lambda **kwargs: {"score": 1}} + evaluator_config = { + "my_grader": { + "_pass_threshold": 0.8, + "_evaluator_definition": {"metrics": {"score": {"type": "numeric"}}}, + } + } + metadata = _extract_testing_criteria_metadata( + evaluators, None, evaluator_config, logging.getLogger(), None, None + ) + assert metadata["my_grader"]["pass_threshold"] == 0.8 + + def test_pass_threshold_zero_propagated(self): + """pass_threshold of 0 should be propagated, not dropped.""" + evaluators = {"my_grader": lambda **kwargs: {"score": 1}} + evaluator_config = { + "my_grader": { + "_pass_threshold": 0, + "_evaluator_definition": {"metrics": {"score": {"type": "numeric"}}}, + } + } + metadata = _extract_testing_criteria_metadata( + evaluators, None, evaluator_config, logging.getLogger(), None, None + ) + assert metadata["my_grader"]["pass_threshold"] == 0 + + def test_pass_threshold_absent_not_added(self): + """When no _pass_threshold in config, metadata should not have pass_threshold.""" + evaluators = {"my_grader": lambda **kwargs: {"score": 1}} + evaluator_config = { + "my_grader": { + "_evaluator_definition": {"metrics": {"score": {"type": "numeric"}}}, + } + } + metadata = _extract_testing_criteria_metadata( + evaluators, None, evaluator_config, logging.getLogger(), None, None + ) + assert "pass_threshold" not in metadata["my_grader"] + + +@pytest.mark.unittest +class TestProcessCriteriaMetricsThresholdInjection: + """Tests for threshold injection in _process_criteria_metrics.""" + + def test_threshold_injected_when_missing(self): + """Metrics without a threshold should get it from pass_threshold in metadata.""" + metrics = {"score": 4.5, "score_reason": "Good"} + testing_criteria_metadata = { + "coherence": { + "metrics": ["score"], + "type": "quality", + "is_inverse": False, + "pass_threshold": 3.0, + } + } + results, _ = _process_criteria_metrics( + "coherence", metrics, testing_criteria_metadata, logging.getLogger(), None, None + ) + assert len(results) > 0 + assert results[0]["threshold"] == 3.0 + + def test_threshold_not_overwritten_when_present(self): + """Metrics that already have a threshold should not be overwritten.""" + metrics = {"score": 4.5, "score_reason": "Good", "score_threshold": 5.0} + testing_criteria_metadata = { + "coherence": { + "metrics": ["score"], + "type": "quality", + "is_inverse": False, + "pass_threshold": 3.0, + } + } + results, _ = _process_criteria_metrics( + "coherence", metrics, testing_criteria_metadata, logging.getLogger(), None, None + ) + assert len(results) > 0 + assert results[0]["threshold"] == 5.0 + + def test_threshold_zero_injected(self): + """pass_threshold of 0 should be injected, not skipped.""" + metrics = {"score": 4.5, "score_reason": "Good"} + testing_criteria_metadata = { + "coherence": { + "metrics": ["score"], + "type": "quality", + "is_inverse": False, + "pass_threshold": 0, + } + } + results, _ = _process_criteria_metrics( + "coherence", metrics, testing_criteria_metadata, logging.getLogger(), None, None + ) + assert len(results) > 0 + assert results[0]["threshold"] == 0 + + def test_no_injection_without_pass_threshold(self): + """Without pass_threshold in metadata, threshold should remain None.""" + metrics = {"score": 4.5, "score_reason": "Good"} + testing_criteria_metadata = { + "coherence": { + "metrics": ["score"], + "type": "quality", + "is_inverse": False, + } + } + results, _ = _process_criteria_metrics( + "coherence", metrics, testing_criteria_metadata, logging.getLogger(), None, None + ) + assert len(results) > 0 + assert results[0].get("threshold") is None From 40a463b993c30ac66d4178b8beefab0668d40736 Mon Sep 17 00:00:00 2001 From: April Kwong Date: Sun, 8 Mar 2026 01:18:30 -0800 Subject: [PATCH 4/6] Skip None/NaN rows when inferring schema types The flat schema generator now scans past None and NaN values to find the first non-null sample for type inference, instead of only checking iloc[0]. This avoids schema-data mismatches when the first row has missing values but later rows contain lists or dicts. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../ai/evaluation/_evaluate/_evaluate_aoai.py | 18 +++++++------ .../tests/unittests/test_aoai_data_source.py | 27 +++++++++++++++++++ 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py index f65b0969027f..e89df9e7adeb 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py @@ -715,14 +715,16 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di props = data_source_config["item_schema"]["properties"] req = data_source_config["item_schema"]["required"] for key in column_mapping.keys(): - if key in input_data_df and len(input_data_df[key]) > 0: - sample = input_data_df[key].iloc[0] - if isinstance(sample, list): - props[key] = {"type": "array"} - elif isinstance(sample, dict): - props[key] = {"type": "object"} - else: - props[key] = {"type": "string"} + sample = None + if key in input_data_df: + for candidate in input_data_df[key]: + if candidate is not None and not (isinstance(candidate, float) and pd.isna(candidate)): + sample = candidate + break + if isinstance(sample, list): + props[key] = {"type": "array"} + elif isinstance(sample, dict): + props[key] = {"type": "object"} else: props[key] = {"type": "string"} req.append(key) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py index b7baf9ce893c..b70c0687d53b 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py @@ -322,6 +322,33 @@ def test_flat_schema_infers_list_and_dict_types(self, flat_test_data): # Numerics (converted to str by _convert_value) should be typed as string assert properties["score"]["type"] == "string" + def test_flat_schema_skips_none_nan_for_type_inference(self): + """Test that schema inference skips None/NaN rows to find the real type.""" + import numpy as np + + df = pd.DataFrame( + { + "tags": [None, ["tag1", "tag2"], ["tag3"]], + "metadata": [np.nan, {"key": "val"}, {}], + "query": [None, None, "hello"], + } + ) + column_mapping = { + "tags": "${data.tags}", + "metadata": "${data.metadata}", + "query": "${data.query}", + } + + config = _generate_data_source_config(df, column_mapping) + properties = config["item_schema"]["properties"] + + # Should look past None in row 0 and find list in row 1 + assert properties["tags"]["type"] == "array" + # Should look past NaN in row 0 and find dict in row 1 + assert properties["metadata"]["type"] == "object" + # All None → falls back to string + assert properties["query"]["type"] == "string" + @pytest.mark.unittest class TestGetDataSource: From c8964fc8ada494a6b2737814a360a011322cfd7c Mon Sep 17 00:00:00 2001 From: April Kwong Date: Sun, 8 Mar 2026 01:31:51 -0800 Subject: [PATCH 5/6] Address PR review comments - Use _is_none_or_nan for threshold injection check so NaN thresholds are also replaced by pass_threshold from config. - Use pd.isna with guard for list/dict when skipping null sentinels (handles pd.NA, NaT, etc. in addition to None and float NaN). - Infer leaf types in nested schema via leaf_type_map parameter on _build_schema_tree_from_paths so nested paths with list/dict data get array/object schema types instead of always defaulting to string. - Add tests for leaf_type_map, nested schema type inference, pd.NA handling, and NaN threshold injection. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../ai/evaluation/_evaluate/_evaluate.py | 2 +- .../ai/evaluation/_evaluate/_evaluate_aoai.py | 45 +++++++++++-- .../tests/unittests/test_aoai_data_source.py | 64 +++++++++++++++++++ .../tests/unittests/test_evaluate.py | 17 +++++ 4 files changed, 120 insertions(+), 8 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py index 44273f687bef..f2960059d2e1 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py @@ -2513,7 +2513,7 @@ def _process_criteria_metrics( config_threshold = testing_criteria_metadata.get(criteria_name, {}).get("pass_threshold") if config_threshold is not None: for metric_values in result_per_metric.values(): - if metric_values.get("threshold") is None: + if _is_none_or_nan(metric_values.get("threshold")): metric_values["threshold"] = config_threshold # Convert to result objects diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py index e89df9e7adeb..4a75d6c000a2 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py @@ -590,6 +590,7 @@ def _get_graders_and_column_mappings( def _build_schema_tree_from_paths( paths: List[str], force_leaf_type: str = "string", + leaf_type_map: Optional[Dict[str, str]] = None, ) -> Dict[str, Any]: """ Build a nested JSON schema (object) from a list of dot-delimited paths. @@ -629,13 +630,16 @@ def _build_schema_tree_from_paths( :param force_leaf_type: The JSON Schema ``type`` value to assign to every leaf node produced from the supplied paths. Defaults to ``"string"``. :type force_leaf_type: str + :param leaf_type_map: Optional mapping from leaf path to JSON Schema type. When + provided, overrides ``force_leaf_type`` for any path present in this map. + :type leaf_type_map: Optional[Dict[str, str]] :return: A JSON Schema fragment describing the hierarchical structure implied by the input paths. The returned schema root always has ``type: object`` with recursively nested ``properties`` / ``required`` keys. :rtype: Dict[str, Any] """ - # Build tree where each node: {"__children__": { segment: node, ... }, "__leaf__": bool } - root: Dict[str, Any] = {"__children__": {}, "__leaf__": False} + # Build tree where each node: {"__children__": { segment: node, ... }, "__leaf__": bool, "__path__": str } + root: Dict[str, Any] = {"__children__": {}, "__leaf__": False, "__path__": ""} def insert(path: str): parts = [p for p in path.split(".") if p] @@ -643,19 +647,23 @@ def insert(path: str): for i, part in enumerate(parts): children = node["__children__"] if part not in children: - children[part] = {"__children__": {}, "__leaf__": False} + children[part] = {"__children__": {}, "__leaf__": False, "__path__": ""} node = children[part] if i == len(parts) - 1: node["__leaf__"] = True + node["__path__"] = path for p in paths: insert(p) + _leaf_types = leaf_type_map or {} + def to_schema(node: Dict[str, Any]) -> Dict[str, Any]: children = node["__children__"] if not children: - # Leaf node - return {"type": force_leaf_type} + # Leaf node — use per-leaf type if available, else force_leaf_type + leaf_type = _leaf_types.get(node["__path__"], force_leaf_type) + return {"type": leaf_type} props = {} required = [] for name, child in children.items(): @@ -718,7 +726,15 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di sample = None if key in input_data_df: for candidate in input_data_df[key]: - if candidate is not None and not (isinstance(candidate, float) and pd.isna(candidate)): + # Skip null-like scalar values (None, NaN, pd.NA, NaT, etc.) + if isinstance(candidate, (list, dict)): + sample = candidate + break + try: + if candidate is not None and not pd.isna(candidate): + sample = candidate + break + except (TypeError, ValueError): sample = candidate break if isinstance(sample, list): @@ -762,7 +778,22 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di LOGGER.info(f"AOAI: Effective paths after stripping wrapper: {effective_paths}") LOGGER.info(f"AOAI: Building nested schema from {len(effective_paths)} effective paths...") - nested_schema = _build_schema_tree_from_paths(effective_paths, force_leaf_type="string") + + # Infer leaf types from the DataFrame so nested schemas also get array/object types + leaf_type_map: Dict[str, str] = {} + for ref_path, eff_path in zip(referenced_paths, effective_paths if strip_wrapper else referenced_paths): + if ref_path in input_data_df: + for candidate in input_data_df[ref_path]: + if isinstance(candidate, (list, dict)): + leaf_type_map[eff_path] = "array" if isinstance(candidate, list) else "object" + break + try: + if candidate is not None and not pd.isna(candidate): + break + except (TypeError, ValueError): + break + + nested_schema = _build_schema_tree_from_paths(effective_paths, force_leaf_type="string", leaf_type_map=leaf_type_map) LOGGER.info(f"AOAI: Nested schema generated successfully with type '{nested_schema.get('type')}'") return { diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py index b70c0687d53b..55dbbe3b13b8 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py @@ -171,6 +171,26 @@ def test_mixed_depth_paths(self): assert nested["type"] == "object" assert "field" in nested["properties"] + def test_leaf_type_map_overrides_force_leaf_type(self): + """Test that leaf_type_map overrides force_leaf_type for specific paths.""" + paths = ["query", "tags", "metadata"] + leaf_type_map = {"tags": "array", "metadata": "object"} + schema = _build_schema_tree_from_paths(paths, force_leaf_type="string", leaf_type_map=leaf_type_map) + + assert schema["properties"]["query"]["type"] == "string" + assert schema["properties"]["tags"]["type"] == "array" + assert schema["properties"]["metadata"]["type"] == "object" + + def test_leaf_type_map_nested_paths(self): + """Test leaf_type_map with nested paths.""" + paths = ["context.tags", "context.query"] + leaf_type_map = {"context.tags": "array"} + schema = _build_schema_tree_from_paths(paths, force_leaf_type="string", leaf_type_map=leaf_type_map) + + context = schema["properties"]["context"] + assert context["properties"]["tags"]["type"] == "array" + assert context["properties"]["query"]["type"] == "string" + @pytest.mark.unittest class TestGenerateDataSourceConfig: @@ -349,6 +369,50 @@ def test_flat_schema_skips_none_nan_for_type_inference(self): # All None → falls back to string assert properties["query"]["type"] == "string" + def test_flat_schema_skips_pd_na_for_type_inference(self): + """Test that schema inference skips pd.NA sentinel values.""" + df = pd.DataFrame( + { + "tags": [pd.NA, ["tag1", "tag2"], ["tag3"]], + "query": ["hello", "world", "test"], + } + ) + column_mapping = { + "tags": "${data.tags}", + "query": "${data.query}", + } + + config = _generate_data_source_config(df, column_mapping) + properties = config["item_schema"]["properties"] + + assert properties["tags"]["type"] == "array" + assert properties["query"]["type"] == "string" + + def test_nested_schema_infers_list_and_dict_leaf_types(self): + """Test that nested schema infers array/object types for leaf nodes.""" + df = pd.DataFrame( + [ + { + "item.query": "hello", + "item.tags": ["tag1", "tag2"], + "item.metadata": {"key": "val"}, + } + ] + ) + column_mapping = { + "query": "${data.item.query}", + "tags": "${data.item.tags}", + "metadata": "${data.item.metadata}", + } + + config = _generate_data_source_config(df, column_mapping) + schema = config["item_schema"] + + # After wrapper stripping, leaves should have inferred types + assert schema["properties"]["query"]["type"] == "string" + assert schema["properties"]["tags"]["type"] == "array" + assert schema["properties"]["metadata"]["type"] == "object" + @pytest.mark.unittest class TestGetDataSource: diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py index 4b6654cbeb9c..47ef67eb4baa 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py @@ -1979,3 +1979,20 @@ def test_no_injection_without_pass_threshold(self): ) assert len(results) > 0 assert results[0].get("threshold") is None + + def test_nan_threshold_gets_injected(self): + """NaN threshold should be replaced by pass_threshold from config.""" + metrics = {"score": 4.5, "score_reason": "Good", "score_threshold": float("nan")} + testing_criteria_metadata = { + "coherence": { + "metrics": ["score"], + "type": "quality", + "is_inverse": False, + "pass_threshold": 3.0, + } + } + results, _ = _process_criteria_metrics( + "coherence", metrics, testing_criteria_metadata, logging.getLogger(), None, None + ) + assert len(results) > 0 + assert results[0]["threshold"] == 3.0 From ef69f45675bd5eaa29682b756499028e2849f7e3 Mon Sep 17 00:00:00 2001 From: April Kwong Date: Mon, 9 Mar 2026 03:59:45 -0700 Subject: [PATCH 6/6] Apply black formatting to pass CI checks Use line-length=120 from eng/black-pyproject.toml config. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../azure/ai/evaluation/_evaluate/_evaluate_aoai.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py index 4a75d6c000a2..b0d3fb405746 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py @@ -793,7 +793,9 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di except (TypeError, ValueError): break - nested_schema = _build_schema_tree_from_paths(effective_paths, force_leaf_type="string", leaf_type_map=leaf_type_map) + nested_schema = _build_schema_tree_from_paths( + effective_paths, force_leaf_type="string", leaf_type_map=leaf_type_map + ) LOGGER.info(f"AOAI: Nested schema generated successfully with type '{nested_schema.get('type')}'") return {