diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py index b596f4b8430c..f2960059d2e1 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py @@ -1103,7 +1103,7 @@ def _build_internal_log_attributes( # Create a copy of the base log attributes internal_log_attributes: Dict[str, str] = log_attributes.copy() # Add threshold if present - if event_data.get("threshold"): + if event_data.get("threshold") is not None: internal_log_attributes["gen_ai.evaluation.threshold"] = str(event_data["threshold"]) # Add testing criteria details if present @@ -2030,6 +2030,11 @@ def _extract_testing_criteria_metadata( "metrics": metrics, "is_inverse": is_inverse, } + # Propagate pass_threshold from evaluator config so result events can include it + if evaluator_config and criteria_name in evaluator_config: + pass_threshold = evaluator_config[criteria_name].get("_pass_threshold") + if pass_threshold is not None: + testing_criteria_metadata[criteria_name]["pass_threshold"] = pass_threshold return testing_criteria_metadata @@ -2503,6 +2508,14 @@ def _process_criteria_metrics( # Extract metric values result_per_metric = _extract_metric_values(criteria_name, criteria_type, metrics, expected_metrics, logger) + # Inject threshold from evaluator config when not present in raw results + # (e.g., PythonGrader/code evaluators don't emit a threshold column) + config_threshold = testing_criteria_metadata.get(criteria_name, {}).get("pass_threshold") + if config_threshold is not None: + for metric_values in result_per_metric.values(): + if _is_none_or_nan(metric_values.get("threshold")): + metric_values["threshold"] = config_threshold + # Convert to result objects results = [] top_sample = {} diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py index a548fc529ab4..b0d3fb405746 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py @@ -590,6 +590,7 @@ def _get_graders_and_column_mappings( def _build_schema_tree_from_paths( paths: List[str], force_leaf_type: str = "string", + leaf_type_map: Optional[Dict[str, str]] = None, ) -> Dict[str, Any]: """ Build a nested JSON schema (object) from a list of dot-delimited paths. @@ -629,13 +630,16 @@ def _build_schema_tree_from_paths( :param force_leaf_type: The JSON Schema ``type`` value to assign to every leaf node produced from the supplied paths. Defaults to ``"string"``. :type force_leaf_type: str + :param leaf_type_map: Optional mapping from leaf path to JSON Schema type. When + provided, overrides ``force_leaf_type`` for any path present in this map. + :type leaf_type_map: Optional[Dict[str, str]] :return: A JSON Schema fragment describing the hierarchical structure implied by the input paths. The returned schema root always has ``type: object`` with recursively nested ``properties`` / ``required`` keys. :rtype: Dict[str, Any] """ - # Build tree where each node: {"__children__": { segment: node, ... }, "__leaf__": bool } - root: Dict[str, Any] = {"__children__": {}, "__leaf__": False} + # Build tree where each node: {"__children__": { segment: node, ... }, "__leaf__": bool, "__path__": str } + root: Dict[str, Any] = {"__children__": {}, "__leaf__": False, "__path__": ""} def insert(path: str): parts = [p for p in path.split(".") if p] @@ -643,19 +647,23 @@ def insert(path: str): for i, part in enumerate(parts): children = node["__children__"] if part not in children: - children[part] = {"__children__": {}, "__leaf__": False} + children[part] = {"__children__": {}, "__leaf__": False, "__path__": ""} node = children[part] if i == len(parts) - 1: node["__leaf__"] = True + node["__path__"] = path for p in paths: insert(p) + _leaf_types = leaf_type_map or {} + def to_schema(node: Dict[str, Any]) -> Dict[str, Any]: children = node["__children__"] if not children: - # Leaf node - return {"type": force_leaf_type} + # Leaf node — use per-leaf type if available, else force_leaf_type + leaf_type = _leaf_types.get(node["__path__"], force_leaf_type) + return {"type": leaf_type} props = {} required = [] for name, child in children.items(): @@ -715,8 +723,24 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di props = data_source_config["item_schema"]["properties"] req = data_source_config["item_schema"]["required"] for key in column_mapping.keys(): - if key in input_data_df and len(input_data_df[key]) > 0 and isinstance(input_data_df[key].iloc[0], list): + sample = None + if key in input_data_df: + for candidate in input_data_df[key]: + # Skip null-like scalar values (None, NaN, pd.NA, NaT, etc.) + if isinstance(candidate, (list, dict)): + sample = candidate + break + try: + if candidate is not None and not pd.isna(candidate): + sample = candidate + break + except (TypeError, ValueError): + sample = candidate + break + if isinstance(sample, list): props[key] = {"type": "array"} + elif isinstance(sample, dict): + props[key] = {"type": "object"} else: props[key] = {"type": "string"} req.append(key) @@ -754,7 +778,24 @@ def _generate_data_source_config(input_data_df: pd.DataFrame, column_mapping: Di LOGGER.info(f"AOAI: Effective paths after stripping wrapper: {effective_paths}") LOGGER.info(f"AOAI: Building nested schema from {len(effective_paths)} effective paths...") - nested_schema = _build_schema_tree_from_paths(effective_paths, force_leaf_type="string") + + # Infer leaf types from the DataFrame so nested schemas also get array/object types + leaf_type_map: Dict[str, str] = {} + for ref_path, eff_path in zip(referenced_paths, effective_paths if strip_wrapper else referenced_paths): + if ref_path in input_data_df: + for candidate in input_data_df[ref_path]: + if isinstance(candidate, (list, dict)): + leaf_type_map[eff_path] = "array" if isinstance(candidate, list) else "object" + break + try: + if candidate is not None and not pd.isna(candidate): + break + except (TypeError, ValueError): + break + + nested_schema = _build_schema_tree_from_paths( + effective_paths, force_leaf_type="string", leaf_type_map=leaf_type_map + ) LOGGER.info(f"AOAI: Nested schema generated successfully with type '{nested_schema.get('type')}'") return { @@ -816,9 +857,9 @@ def _convert_value(val: Any) -> Any: if isinstance(val, bool): return val # Align numerics with legacy text-only JSONL payloads by turning them into strings. - if isinstance(val, (int, float, list)): + if isinstance(val, (int, float)): return str(val) - if isinstance(val, (dict)): + if isinstance(val, (list, dict)): return val return str(val) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py index 6d77e098eaba..55dbbe3b13b8 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_data_source.py @@ -171,6 +171,26 @@ def test_mixed_depth_paths(self): assert nested["type"] == "object" assert "field" in nested["properties"] + def test_leaf_type_map_overrides_force_leaf_type(self): + """Test that leaf_type_map overrides force_leaf_type for specific paths.""" + paths = ["query", "tags", "metadata"] + leaf_type_map = {"tags": "array", "metadata": "object"} + schema = _build_schema_tree_from_paths(paths, force_leaf_type="string", leaf_type_map=leaf_type_map) + + assert schema["properties"]["query"]["type"] == "string" + assert schema["properties"]["tags"]["type"] == "array" + assert schema["properties"]["metadata"]["type"] == "object" + + def test_leaf_type_map_nested_paths(self): + """Test leaf_type_map with nested paths.""" + paths = ["context.tags", "context.query"] + leaf_type_map = {"context.tags": "array"} + schema = _build_schema_tree_from_paths(paths, force_leaf_type="string", leaf_type_map=leaf_type_map) + + context = schema["properties"]["context"] + assert context["properties"]["tags"]["type"] == "array" + assert context["properties"]["query"]["type"] == "string" + @pytest.mark.unittest class TestGenerateDataSourceConfig: @@ -297,6 +317,102 @@ def test_single_nested_path(self, flat_test_data): # After wrapper stripping, should see context assert "context" in schema["properties"] + def test_flat_schema_infers_list_and_dict_types(self, flat_test_data): + """Test that flat schema correctly infers array/object types from data.""" + flat_test_data["tags"] = [["tag1", "tag2"], ["tag3"], []] + flat_test_data["metadata"] = [{"key": "val"}, {"key2": "val2"}, {}] + flat_test_data["score"] = [95, 87, 92] + + column_mapping = { + "query": "${data.query}", + "tags": "${data.tags}", + "metadata": "${data.metadata}", + "score": "${data.score}", + } + + config = _generate_data_source_config(flat_test_data, column_mapping) + + properties = config["item_schema"]["properties"] + # Strings should be typed as string + assert properties["query"]["type"] == "string" + # Lists should be typed as array + assert properties["tags"]["type"] == "array" + # Dicts should be typed as object + assert properties["metadata"]["type"] == "object" + # Numerics (converted to str by _convert_value) should be typed as string + assert properties["score"]["type"] == "string" + + def test_flat_schema_skips_none_nan_for_type_inference(self): + """Test that schema inference skips None/NaN rows to find the real type.""" + import numpy as np + + df = pd.DataFrame( + { + "tags": [None, ["tag1", "tag2"], ["tag3"]], + "metadata": [np.nan, {"key": "val"}, {}], + "query": [None, None, "hello"], + } + ) + column_mapping = { + "tags": "${data.tags}", + "metadata": "${data.metadata}", + "query": "${data.query}", + } + + config = _generate_data_source_config(df, column_mapping) + properties = config["item_schema"]["properties"] + + # Should look past None in row 0 and find list in row 1 + assert properties["tags"]["type"] == "array" + # Should look past NaN in row 0 and find dict in row 1 + assert properties["metadata"]["type"] == "object" + # All None → falls back to string + assert properties["query"]["type"] == "string" + + def test_flat_schema_skips_pd_na_for_type_inference(self): + """Test that schema inference skips pd.NA sentinel values.""" + df = pd.DataFrame( + { + "tags": [pd.NA, ["tag1", "tag2"], ["tag3"]], + "query": ["hello", "world", "test"], + } + ) + column_mapping = { + "tags": "${data.tags}", + "query": "${data.query}", + } + + config = _generate_data_source_config(df, column_mapping) + properties = config["item_schema"]["properties"] + + assert properties["tags"]["type"] == "array" + assert properties["query"]["type"] == "string" + + def test_nested_schema_infers_list_and_dict_leaf_types(self): + """Test that nested schema infers array/object types for leaf nodes.""" + df = pd.DataFrame( + [ + { + "item.query": "hello", + "item.tags": ["tag1", "tag2"], + "item.metadata": {"key": "val"}, + } + ] + ) + column_mapping = { + "query": "${data.item.query}", + "tags": "${data.item.tags}", + "metadata": "${data.item.metadata}", + } + + config = _generate_data_source_config(df, column_mapping) + schema = config["item_schema"] + + # After wrapper stripping, leaves should have inferred types + assert schema["properties"]["query"]["type"] == "string" + assert schema["properties"]["tags"]["type"] == "array" + assert schema["properties"]["metadata"]["type"] == "object" + @pytest.mark.unittest class TestGetDataSource: @@ -437,7 +553,7 @@ def test_data_source_with_item_column_and_nested_values(self, nested_item_keywor # Ensure we did not accidentally nest another 'item' key inside the wrapper assert "item" not in item_payload assert item_payload["sample"]["output_text"] == "someoutput" - assert item_payload["sample"]["output_items"] == "['item1', 'item2']" + assert item_payload["sample"]["output_items"] == ["item1", "item2"] def test_data_source_with_item_sample_column_and_nested_values(self, nested_item_sample_keyword_data): """Ensure rows that already have an 'item' column keep nested dicts intact.""" @@ -464,7 +580,7 @@ def test_data_source_with_item_sample_column_and_nested_values(self, nested_item # Ensure we did not accidentally nest another 'item' key inside the wrapper assert "item" not in item_payload assert item_payload["sample"]["output_text"] == "someoutput" - assert item_payload["sample"]["output_items"] == "['item1', 'item2']" + assert item_payload["sample"]["output_items"] == ["item1", "item2"] def test_data_source_with_sample_output_metadata(self, flat_sample_output_data): """Ensure flat rows that include dotted sample metadata remain accessible.""" @@ -485,7 +601,7 @@ def test_data_source_with_sample_output_metadata(self, flat_sample_output_data): assert row["test"]["test_string"] == "baking cakes is fun!" # sample.output_text should follow the row through normalization without being stringified assert row["sample.output_text"] == "someoutput" - assert row["sample.output_items"] == "['item1', 'item2']" + assert row["sample.output_items"] == ["item1", "item2"] def test_data_source_with_numeric_values(self, flat_test_data): """Test data source generation converts numeric values to strings.""" @@ -504,6 +620,35 @@ def test_data_source_with_numeric_values(self, flat_test_data): assert isinstance(content[0][WRAPPER_KEY]["score"], str) assert isinstance(content[0][WRAPPER_KEY]["confidence"], str) + def test_data_source_with_list_and_dict_values(self, flat_test_data): + """Test data source generation preserves list and dict values as-is.""" + flat_test_data["tags"] = [["tag1", "tag2"], ["tag3"], []] + flat_test_data["metadata"] = [{"key": "val"}, {"key2": "val2"}, {}] + + column_mapping = { + "query": "${data.query}", + "tags": "${data.tags}", + "metadata": "${data.metadata}", + } + + data_source = _get_data_source(flat_test_data, column_mapping) + + content = data_source["source"]["content"] + + # Lists should be preserved as lists, not stringified + assert content[0][WRAPPER_KEY]["tags"] == ["tag1", "tag2"] + assert isinstance(content[0][WRAPPER_KEY]["tags"], list) + # Empty lists should also be preserved + assert content[2][WRAPPER_KEY]["tags"] == [] + assert isinstance(content[2][WRAPPER_KEY]["tags"], list) + + # Dicts should be preserved as dicts + assert content[0][WRAPPER_KEY]["metadata"] == {"key": "val"} + assert isinstance(content[0][WRAPPER_KEY]["metadata"], dict) + # Empty dicts should also be preserved + assert content[2][WRAPPER_KEY]["metadata"] == {} + assert isinstance(content[2][WRAPPER_KEY]["metadata"], dict) + def test_empty_dataframe(self): """Test data source generation with empty dataframe.""" empty_df = pd.DataFrame() @@ -600,3 +745,33 @@ def test_nested_schema_and_data_alignment(self, nested_test_data): assert "query" in item assert "context" in item assert "company" in item["context"] + + def test_flat_schema_and_data_alignment_with_list_and_dict(self, flat_test_data): + """Test that schema types and data values agree for list/dict columns.""" + flat_test_data["tags"] = [["tag1", "tag2"], ["tag3"], []] + flat_test_data["metadata"] = [{"key": "val"}, {"key2": "val2"}, {}] + + column_mapping = { + "query": "${data.query}", + "tags": "${data.tags}", + "metadata": "${data.metadata}", + } + + config = _generate_data_source_config(flat_test_data, column_mapping) + data_source = _get_data_source(flat_test_data, column_mapping) + + schema_props = config["item_schema"]["properties"] + data_item = data_source["source"]["content"][0][WRAPPER_KEY] + + # Schema declares array → data contains a list + assert schema_props["tags"]["type"] == "array" + assert isinstance(data_item["tags"], list) + + # Schema declares object → data contains a dict + assert schema_props["metadata"]["type"] == "object" + assert isinstance(data_item["metadata"], dict) + + # Empty collections should also align + empty_item = data_source["source"]["content"][2][WRAPPER_KEY] + assert isinstance(empty_item["tags"], list) + assert isinstance(empty_item["metadata"], dict) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py index 3a5627abc604..47ef67eb4baa 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py @@ -41,6 +41,9 @@ _process_rows, _aggregate_label_defect_metrics, _update_metric_value, + _build_internal_log_attributes, + _extract_testing_criteria_metadata, + _process_criteria_metrics, ) from azure.ai.evaluation._evaluate._utils import _convert_name_map_into_property_entries from azure.ai.evaluation._evaluate._utils import _apply_column_mapping, _trace_destination_from_project_scope @@ -1822,3 +1825,174 @@ def test_nan_string_maps_to_none(self, suffix): result = self._call(f"evaluator{suffix}", "nan") token_key = suffix.lstrip("_") assert result["sample"]["usage"][token_key] is None + + +@pytest.mark.unittest +class TestBuildInternalLogAttributesThreshold: + """Tests for _build_internal_log_attributes threshold handling.""" + + def test_threshold_zero_is_included(self): + """Threshold of 0 should be logged, not silently dropped.""" + event_data = {"threshold": 0, "name": "my_grader"} + attrs = _build_internal_log_attributes(event_data, "score", None, {}) + assert "gen_ai.evaluation.threshold" in attrs + assert attrs["gen_ai.evaluation.threshold"] == "0" + + def test_threshold_zero_float_is_included(self): + """Threshold of 0.0 should be logged.""" + event_data = {"threshold": 0.0, "name": "my_grader"} + attrs = _build_internal_log_attributes(event_data, "score", None, {}) + assert attrs["gen_ai.evaluation.threshold"] == "0.0" + + def test_threshold_none_is_excluded(self): + """No threshold key when threshold is None.""" + event_data = {"threshold": None, "name": "my_grader"} + attrs = _build_internal_log_attributes(event_data, "score", None, {}) + assert "gen_ai.evaluation.threshold" not in attrs + + def test_threshold_missing_is_excluded(self): + """No threshold key when threshold is absent from event_data.""" + event_data = {"name": "my_grader"} + attrs = _build_internal_log_attributes(event_data, "score", None, {}) + assert "gen_ai.evaluation.threshold" not in attrs + + def test_threshold_positive_value(self): + """Normal positive threshold is included.""" + event_data = {"threshold": 3.5, "name": "my_grader"} + attrs = _build_internal_log_attributes(event_data, "score", None, {}) + assert attrs["gen_ai.evaluation.threshold"] == "3.5" + + +@pytest.mark.unittest +class TestExtractTestingCriteriaMetadataPassThreshold: + """Tests for pass_threshold propagation in _extract_testing_criteria_metadata.""" + + def test_pass_threshold_propagated_from_config(self): + """pass_threshold from evaluator_config should appear in metadata.""" + evaluators = {"my_grader": lambda **kwargs: {"score": 1}} + evaluator_config = { + "my_grader": { + "_pass_threshold": 0.8, + "_evaluator_definition": {"metrics": {"score": {"type": "numeric"}}}, + } + } + metadata = _extract_testing_criteria_metadata( + evaluators, None, evaluator_config, logging.getLogger(), None, None + ) + assert metadata["my_grader"]["pass_threshold"] == 0.8 + + def test_pass_threshold_zero_propagated(self): + """pass_threshold of 0 should be propagated, not dropped.""" + evaluators = {"my_grader": lambda **kwargs: {"score": 1}} + evaluator_config = { + "my_grader": { + "_pass_threshold": 0, + "_evaluator_definition": {"metrics": {"score": {"type": "numeric"}}}, + } + } + metadata = _extract_testing_criteria_metadata( + evaluators, None, evaluator_config, logging.getLogger(), None, None + ) + assert metadata["my_grader"]["pass_threshold"] == 0 + + def test_pass_threshold_absent_not_added(self): + """When no _pass_threshold in config, metadata should not have pass_threshold.""" + evaluators = {"my_grader": lambda **kwargs: {"score": 1}} + evaluator_config = { + "my_grader": { + "_evaluator_definition": {"metrics": {"score": {"type": "numeric"}}}, + } + } + metadata = _extract_testing_criteria_metadata( + evaluators, None, evaluator_config, logging.getLogger(), None, None + ) + assert "pass_threshold" not in metadata["my_grader"] + + +@pytest.mark.unittest +class TestProcessCriteriaMetricsThresholdInjection: + """Tests for threshold injection in _process_criteria_metrics.""" + + def test_threshold_injected_when_missing(self): + """Metrics without a threshold should get it from pass_threshold in metadata.""" + metrics = {"score": 4.5, "score_reason": "Good"} + testing_criteria_metadata = { + "coherence": { + "metrics": ["score"], + "type": "quality", + "is_inverse": False, + "pass_threshold": 3.0, + } + } + results, _ = _process_criteria_metrics( + "coherence", metrics, testing_criteria_metadata, logging.getLogger(), None, None + ) + assert len(results) > 0 + assert results[0]["threshold"] == 3.0 + + def test_threshold_not_overwritten_when_present(self): + """Metrics that already have a threshold should not be overwritten.""" + metrics = {"score": 4.5, "score_reason": "Good", "score_threshold": 5.0} + testing_criteria_metadata = { + "coherence": { + "metrics": ["score"], + "type": "quality", + "is_inverse": False, + "pass_threshold": 3.0, + } + } + results, _ = _process_criteria_metrics( + "coherence", metrics, testing_criteria_metadata, logging.getLogger(), None, None + ) + assert len(results) > 0 + assert results[0]["threshold"] == 5.0 + + def test_threshold_zero_injected(self): + """pass_threshold of 0 should be injected, not skipped.""" + metrics = {"score": 4.5, "score_reason": "Good"} + testing_criteria_metadata = { + "coherence": { + "metrics": ["score"], + "type": "quality", + "is_inverse": False, + "pass_threshold": 0, + } + } + results, _ = _process_criteria_metrics( + "coherence", metrics, testing_criteria_metadata, logging.getLogger(), None, None + ) + assert len(results) > 0 + assert results[0]["threshold"] == 0 + + def test_no_injection_without_pass_threshold(self): + """Without pass_threshold in metadata, threshold should remain None.""" + metrics = {"score": 4.5, "score_reason": "Good"} + testing_criteria_metadata = { + "coherence": { + "metrics": ["score"], + "type": "quality", + "is_inverse": False, + } + } + results, _ = _process_criteria_metrics( + "coherence", metrics, testing_criteria_metadata, logging.getLogger(), None, None + ) + assert len(results) > 0 + assert results[0].get("threshold") is None + + def test_nan_threshold_gets_injected(self): + """NaN threshold should be replaced by pass_threshold from config.""" + metrics = {"score": 4.5, "score_reason": "Good", "score_threshold": float("nan")} + testing_criteria_metadata = { + "coherence": { + "metrics": ["score"], + "type": "quality", + "is_inverse": False, + "pass_threshold": 3.0, + } + } + results, _ = _process_criteria_metrics( + "coherence", metrics, testing_criteria_metadata, logging.getLogger(), None, None + ) + assert len(results) > 0 + assert results[0]["threshold"] == 3.0