diff --git a/README.md b/README.md index 799536c3..c9fa4d08 100644 --- a/README.md +++ b/README.md @@ -63,6 +63,7 @@ git clone https://github.com/The-OpenROAD-Project/ORAssistant.git ``` **Step 2**: Copy the `.env.example` file, and update your `.env` file with the appropriate API keys. +> **Note:** When configuring `EMBEDDINGS_TYPE`, valid options are `HF`, `GOOGLE_GENAI`, or `GOOGLE_VERTEXAI`. If using `HF`, no API key is needed but startup is slower. If using `GOOGLE_GENAI`, ensure `GOOGLE_API_KEY` is set. Modify the Docker `HEALTHCHECK_` variables based on the hardware requirements. If you have a resource-constrained PC, try increasing `HEALTHCHECK_START_PERIOD` to a value large @@ -99,6 +100,7 @@ uv sync ``` **Step 2**: Copy the `.env.example` file, and update your `.env` file with the appropriate API keys. +> **Note:** When configuring `EMBEDDINGS_TYPE`, valid options are `HF`, `GOOGLE_GENAI`, or `GOOGLE_VERTEXAI`. If using `HF`, no API key is needed but startup is slower. If using `GOOGLE_GENAI`, ensure `GOOGLE_API_KEY` is set.
```bash cd backend diff --git a/backend/.env.example b/backend/.env.example index 5ce3d60e..3a0848ce 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -22,6 +22,7 @@ GOOGLE_GEMINI=2.0_flash LLM_TEMP=1 +# Choose embeddings type (options: HF, GOOGLE_GENAI, GOOGLE_VERTEXAI) EMBEDDINGS_TYPE=GOOGLE_GENAI GOOGLE_EMBEDDINGS=gemini-embedding-001 HF_EMBEDDINGS=thenlper/gte-large diff --git a/evaluation/auto_evaluation/tests/conftest.py b/evaluation/auto_evaluation/tests/conftest.py new file mode 100644 index 00000000..442ec292 --- /dev/null +++ b/evaluation/auto_evaluation/tests/conftest.py @@ -0,0 +1,4 @@ +import sys +import os + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) diff --git a/evaluation/auto_evaluation/tests/test_metrics.py b/evaluation/auto_evaluation/tests/test_metrics.py new file mode 100644 index 00000000..d2b853be --- /dev/null +++ b/evaluation/auto_evaluation/tests/test_metrics.py @@ -0,0 +1,177 @@ +""" +Unit tests for evaluation metrics module. +Tests verify metric factory functions return correct types, +use correct thresholds, and handle NotImplementedError cases. 
+""" +import pytest +from unittest.mock import MagicMock +from deepeval.metrics import ( + ContextualPrecisionMetric, + ContextualRecallMetric, + HallucinationMetric, + AnswerRelevancyMetric, + BiasMetric, + ToxicityMetric, + GEval, +) +from deepeval.models.base_model import DeepEvalBaseLLM + +from src.metrics.retrieval import ( + make_contextual_precision_metric, + make_contextual_recall_metric, + make_contextual_relevancy_metric, + make_faithfulness_metric, + make_hallucination_metric, + PRECISION_THRESHOLD, + RECALL_THRESHOLD, + HALLUCINATION_THRESHOLD, +) +from src.metrics.content import ( + make_answer_relevancy_metric, + make_bias_metric, + make_toxicity_metric, + ANSRELEVANCY_THRESHOLD, + BIAS_THRESHOLD, + TOXICITY_THRESHOLD, +) +from src.metrics.geval import make_correctness_metric + + +@pytest.fixture +def mock_model(): + """Mock DeepEvalBaseLLM model for testing.""" + return MagicMock(spec=DeepEvalBaseLLM) + + +class TestRetrievalMetrics: + """Tests for retrieval-based evaluation metrics.""" + + def test_make_contextual_precision_metric_returns_correct_type(self, mock_model): + metric = make_contextual_precision_metric(mock_model) + assert isinstance(metric, ContextualPrecisionMetric) + + def test_make_contextual_precision_metric_threshold(self, mock_model): + metric = make_contextual_precision_metric(mock_model) + assert metric.threshold == PRECISION_THRESHOLD + + def test_make_contextual_precision_metric_includes_reason(self, mock_model): + metric = make_contextual_precision_metric(mock_model) + assert metric.include_reason is True + + def test_make_contextual_recall_metric_returns_correct_type(self, mock_model): + metric = make_contextual_recall_metric(mock_model) + assert isinstance(metric, ContextualRecallMetric) + + def test_make_contextual_recall_metric_threshold(self, mock_model): + metric = make_contextual_recall_metric(mock_model) + assert metric.threshold == RECALL_THRESHOLD + + def test_make_contextual_recall_metric_includes_reason(self, 
mock_model): + metric = make_contextual_recall_metric(mock_model) + assert metric.include_reason is True + + def test_make_hallucination_metric_returns_correct_type(self, mock_model): + metric = make_hallucination_metric(mock_model) + assert isinstance(metric, HallucinationMetric) + + def test_make_hallucination_metric_threshold(self, mock_model): + metric = make_hallucination_metric(mock_model) + assert metric.threshold == HALLUCINATION_THRESHOLD + + def test_make_hallucination_metric_includes_reason(self, mock_model): + metric = make_hallucination_metric(mock_model) + assert metric.include_reason is True + + def test_make_contextual_relevancy_metric_raises_not_implemented(self, mock_model): + """ContextualRelevancyMetric is disabled due to protobuf incompatibility.""" + with pytest.raises(NotImplementedError, match="protobuf incompatability"): + make_contextual_relevancy_metric(mock_model) + + def test_make_faithfulness_metric_raises_not_implemented(self, mock_model): + """FaithfulnessMetric is disabled due to protobuf incompatibility.""" + with pytest.raises(NotImplementedError, match="protobuf incompatability"): + make_faithfulness_metric(mock_model) + + +class TestContentMetrics: + """Tests for content-based evaluation metrics.""" + + def test_make_answer_relevancy_metric_returns_correct_type(self, mock_model): + metric = make_answer_relevancy_metric(mock_model) + assert isinstance(metric, AnswerRelevancyMetric) + + def test_make_answer_relevancy_metric_threshold(self, mock_model): + metric = make_answer_relevancy_metric(mock_model) + assert metric.threshold == ANSRELEVANCY_THRESHOLD + + def test_make_answer_relevancy_metric_includes_reason(self, mock_model): + metric = make_answer_relevancy_metric(mock_model) + assert metric.include_reason is True + + def test_make_bias_metric_returns_correct_type(self, mock_model): + metric = make_bias_metric(mock_model) + assert isinstance(metric, BiasMetric) + + def test_make_bias_metric_threshold(self, mock_model): + 
metric = make_bias_metric(mock_model) + assert metric.threshold == BIAS_THRESHOLD + + def test_make_bias_metric_includes_reason(self, mock_model): + metric = make_bias_metric(mock_model) + assert metric.include_reason is True + + def test_make_toxicity_metric_returns_correct_type(self, mock_model): + metric = make_toxicity_metric(mock_model) + assert isinstance(metric, ToxicityMetric) + + def test_make_toxicity_metric_threshold(self, mock_model): + metric = make_toxicity_metric(mock_model) + assert metric.threshold == TOXICITY_THRESHOLD + + def test_make_toxicity_metric_includes_reason(self, mock_model): + metric = make_toxicity_metric(mock_model) + assert metric.include_reason is True + + +class TestGEvalMetrics: + """Tests for GEval custom LLM-based metrics.""" + + def test_make_correctness_metric_returns_geval(self, mock_model): + metric = make_correctness_metric(mock_model) + assert isinstance(metric, GEval) + + def test_make_correctness_metric_name(self, mock_model): + metric = make_correctness_metric(mock_model) + assert metric.name == "Correctness" + + def test_make_correctness_metric_has_evaluation_steps(self, mock_model): + metric = make_correctness_metric(mock_model) + assert metric.evaluation_steps is not None + assert len(metric.evaluation_steps) > 0 + + def test_make_correctness_metric_has_criteria(self, mock_model): + metric = make_correctness_metric(mock_model) + assert metric.criteria is not None + assert "factually correct" in metric.criteria + + +class TestThresholdValues: + """Tests to verify threshold constants are within valid range.""" + + def test_precision_threshold_valid(self): + assert 0.0 <= PRECISION_THRESHOLD <= 1.0 + + def test_recall_threshold_valid(self): + assert 0.0 <= RECALL_THRESHOLD <= 1.0 + + def test_hallucination_threshold_valid(self): + assert 0.0 <= HALLUCINATION_THRESHOLD <= 1.0 + + def test_answer_relevancy_threshold_valid(self): + assert 0.0 <= ANSRELEVANCY_THRESHOLD <= 1.0 + + def test_bias_threshold_valid(self): + 
assert 0.0 <= BIAS_THRESHOLD <= 1.0 + + def test_toxicity_threshold_valid(self): + assert 0.0 <= TOXICITY_THRESHOLD <= 1.0