Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ git clone https://github.com/The-OpenROAD-Project/ORAssistant.git
```

**Step 2**: Copy the `.env.example` file, and update your `.env` file with the appropriate API keys.
> **Note:** When configuring `EMBEDDINGS_TYPE`, valid options are `HF`, `GOOGLE_GENAI`, or `GOOGLE_VERTEXAI`. If using `HF`, no API key is needed, but startup is slower. If using `GOOGLE_GENAI`, ensure `GOOGLE_API_KEY` is set.

Modify the Docker `HEALTHCHECK_` variables based on the hardware requirements.
If you have a resource-constrained PC, try increasing `HEALTHCHECK_START_PERIOD` to a value large
Expand Down Expand Up @@ -99,6 +100,7 @@ uv sync
```

**Step 2**: Copy the `.env.example` file, and update your `.env` file with the appropriate API keys.
> **Note:** When configuring `EMBEDDINGS_TYPE`, valid options are `HF`, `GOOGLE_GENAI`, or `GOOGLE_VERTEXAI`. If using `HF`, no API key is needed, but startup is slower. If using `GOOGLE_GENAI`, ensure `GOOGLE_API_KEY` is set.

```bash
cd backend
Expand Down
1 change: 1 addition & 0 deletions backend/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ GOOGLE_GEMINI=2.0_flash

LLM_TEMP=1

# Choose embeddings type (options: HF, GOOGLE_GENAI, GOOGLE_VERTEXAI)
EMBEDDINGS_TYPE=GOOGLE_GENAI
GOOGLE_EMBEDDINGS=gemini-embedding-001
HF_EMBEDDINGS=thenlper/gte-large
Expand Down
4 changes: 4 additions & 0 deletions evaluation/auto_evaluation/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
import sys
import os

# Put the directory one level above this conftest at the front of the import
# search path, so that the test modules' `src.*` imports can be resolved from it.
_PARENT_DIR = os.path.join(os.path.dirname(__file__), "..")
sys.path.insert(0, _PARENT_DIR)
177 changes: 177 additions & 0 deletions evaluation/auto_evaluation/tests/test_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
"""
Unit tests for evaluation metrics module.
Tests verify metric factory functions return correct types,
use correct thresholds, and handle NotImplementedError cases.
"""
import pytest
from unittest.mock import MagicMock
from deepeval.metrics import (
ContextualPrecisionMetric,
ContextualRecallMetric,
HallucinationMetric,
AnswerRelevancyMetric,
BiasMetric,
ToxicityMetric,
GEval,
)
from deepeval.models.base_model import DeepEvalBaseLLM

from src.metrics.retrieval import (
make_contextual_precision_metric,
make_contextual_recall_metric,
make_contextual_relevancy_metric,
make_faithfulness_metric,
make_hallucination_metric,
PRECISION_THRESHOLD,
RECALL_THRESHOLD,
HALLUCINATION_THRESHOLD,
)
from src.metrics.content import (
make_answer_relevancy_metric,
make_bias_metric,
make_toxicity_metric,
ANSRELEVANCY_THRESHOLD,
BIAS_THRESHOLD,
TOXICITY_THRESHOLD,
)
from src.metrics.geval import make_correctness_metric


@pytest.fixture
def mock_model():
    """Provide a ``MagicMock`` constrained to the ``DeepEvalBaseLLM`` interface.

    The mock is passed to the metric factory functions in place of a real
    model object.
    """
    fake_llm = MagicMock(spec=DeepEvalBaseLLM)
    return fake_llm


class TestRetrievalMetrics:
    """Checks on the factory functions for retrieval-oriented metrics.

    Each working factory is verified for the metric class it returns, the
    threshold constant it applies, and that ``include_reason`` is enabled.
    The two disabled factories are verified to raise ``NotImplementedError``.
    """

    # --- contextual precision -------------------------------------------

    def test_make_contextual_precision_metric_returns_correct_type(self, mock_model):
        """The precision factory builds a ``ContextualPrecisionMetric``."""
        assert isinstance(
            make_contextual_precision_metric(mock_model),
            ContextualPrecisionMetric,
        )

    def test_make_contextual_precision_metric_threshold(self, mock_model):
        """The precision metric carries ``PRECISION_THRESHOLD``."""
        built = make_contextual_precision_metric(mock_model)
        assert built.threshold == PRECISION_THRESHOLD

    def test_make_contextual_precision_metric_includes_reason(self, mock_model):
        """The precision metric has ``include_reason`` switched on."""
        built = make_contextual_precision_metric(mock_model)
        assert built.include_reason is True

    # --- contextual recall ----------------------------------------------

    def test_make_contextual_recall_metric_returns_correct_type(self, mock_model):
        """The recall factory builds a ``ContextualRecallMetric``."""
        assert isinstance(
            make_contextual_recall_metric(mock_model),
            ContextualRecallMetric,
        )

    def test_make_contextual_recall_metric_threshold(self, mock_model):
        """The recall metric carries ``RECALL_THRESHOLD``."""
        built = make_contextual_recall_metric(mock_model)
        assert built.threshold == RECALL_THRESHOLD

    def test_make_contextual_recall_metric_includes_reason(self, mock_model):
        """The recall metric has ``include_reason`` switched on."""
        built = make_contextual_recall_metric(mock_model)
        assert built.include_reason is True

    # --- hallucination --------------------------------------------------

    def test_make_hallucination_metric_returns_correct_type(self, mock_model):
        """The hallucination factory builds a ``HallucinationMetric``."""
        assert isinstance(
            make_hallucination_metric(mock_model),
            HallucinationMetric,
        )

    def test_make_hallucination_metric_threshold(self, mock_model):
        """The hallucination metric carries ``HALLUCINATION_THRESHOLD``."""
        built = make_hallucination_metric(mock_model)
        assert built.threshold == HALLUCINATION_THRESHOLD

    def test_make_hallucination_metric_includes_reason(self, mock_model):
        """The hallucination metric has ``include_reason`` switched on."""
        built = make_hallucination_metric(mock_model)
        assert built.include_reason is True

    # --- disabled factories ---------------------------------------------

    def test_make_contextual_relevancy_metric_raises_not_implemented(self, mock_model):
        """ContextualRelevancyMetric is disabled due to protobuf incompatibility."""
        # The pattern is matched against the raised message, so its spelling
        # must stay exactly as written here.
        expected_message = "protobuf incompatability"
        with pytest.raises(NotImplementedError, match=expected_message):
            make_contextual_relevancy_metric(mock_model)

    def test_make_faithfulness_metric_raises_not_implemented(self, mock_model):
        """FaithfulnessMetric is disabled due to protobuf incompatibility."""
        # The pattern is matched against the raised message, so its spelling
        # must stay exactly as written here.
        expected_message = "protobuf incompatability"
        with pytest.raises(NotImplementedError, match=expected_message):
            make_faithfulness_metric(mock_model)


class TestContentMetrics:
    """Checks on the factory functions for content-oriented metrics.

    For answer relevancy, bias, and toxicity, each factory is verified for
    the metric class it returns, the threshold constant it applies, and that
    ``include_reason`` is enabled.
    """

    # --- answer relevancy -----------------------------------------------

    def test_make_answer_relevancy_metric_returns_correct_type(self, mock_model):
        """The answer-relevancy factory builds an ``AnswerRelevancyMetric``."""
        assert isinstance(
            make_answer_relevancy_metric(mock_model),
            AnswerRelevancyMetric,
        )

    def test_make_answer_relevancy_metric_threshold(self, mock_model):
        """The answer-relevancy metric carries ``ANSRELEVANCY_THRESHOLD``."""
        built = make_answer_relevancy_metric(mock_model)
        assert built.threshold == ANSRELEVANCY_THRESHOLD

    def test_make_answer_relevancy_metric_includes_reason(self, mock_model):
        """The answer-relevancy metric has ``include_reason`` switched on."""
        built = make_answer_relevancy_metric(mock_model)
        assert built.include_reason is True

    # --- bias -----------------------------------------------------------

    def test_make_bias_metric_returns_correct_type(self, mock_model):
        """The bias factory builds a ``BiasMetric``."""
        assert isinstance(make_bias_metric(mock_model), BiasMetric)

    def test_make_bias_metric_threshold(self, mock_model):
        """The bias metric carries ``BIAS_THRESHOLD``."""
        built = make_bias_metric(mock_model)
        assert built.threshold == BIAS_THRESHOLD

    def test_make_bias_metric_includes_reason(self, mock_model):
        """The bias metric has ``include_reason`` switched on."""
        built = make_bias_metric(mock_model)
        assert built.include_reason is True

    # --- toxicity -------------------------------------------------------

    def test_make_toxicity_metric_returns_correct_type(self, mock_model):
        """The toxicity factory builds a ``ToxicityMetric``."""
        assert isinstance(make_toxicity_metric(mock_model), ToxicityMetric)

    def test_make_toxicity_metric_threshold(self, mock_model):
        """The toxicity metric carries ``TOXICITY_THRESHOLD``."""
        built = make_toxicity_metric(mock_model)
        assert built.threshold == TOXICITY_THRESHOLD

    def test_make_toxicity_metric_includes_reason(self, mock_model):
        """The toxicity metric has ``include_reason`` switched on."""
        built = make_toxicity_metric(mock_model)
        assert built.include_reason is True


class TestGEvalMetrics:
    """Checks on the GEval-based correctness metric factory.

    Verifies the returned object's type, its name, and that both evaluation
    steps and criteria text are populated.
    """

    def test_make_correctness_metric_returns_geval(self, mock_model):
        """The correctness factory builds a ``GEval`` instance."""
        assert isinstance(make_correctness_metric(mock_model), GEval)

    def test_make_correctness_metric_name(self, mock_model):
        """The correctness metric is named ``"Correctness"``."""
        built = make_correctness_metric(mock_model)
        assert built.name == "Correctness"

    def test_make_correctness_metric_has_evaluation_steps(self, mock_model):
        """The correctness metric defines at least one evaluation step."""
        steps = make_correctness_metric(mock_model).evaluation_steps
        assert steps is not None
        assert len(steps) > 0

    def test_make_correctness_metric_has_criteria(self, mock_model):
        """The criteria text is set and mentions factual correctness."""
        criteria = make_correctness_metric(mock_model).criteria
        assert criteria is not None
        assert "factually correct" in criteria


class TestThresholdValues:
    """Sanity checks that every threshold constant lies within [0.0, 1.0].

    Each bound is asserted separately so a failure shows which side of the
    range was violated.
    """

    def test_precision_threshold_valid(self):
        """``PRECISION_THRESHOLD`` is between 0.0 and 1.0 inclusive."""
        assert PRECISION_THRESHOLD >= 0.0
        assert PRECISION_THRESHOLD <= 1.0

    def test_recall_threshold_valid(self):
        """``RECALL_THRESHOLD`` is between 0.0 and 1.0 inclusive."""
        assert RECALL_THRESHOLD >= 0.0
        assert RECALL_THRESHOLD <= 1.0

    def test_hallucination_threshold_valid(self):
        """``HALLUCINATION_THRESHOLD`` is between 0.0 and 1.0 inclusive."""
        assert HALLUCINATION_THRESHOLD >= 0.0
        assert HALLUCINATION_THRESHOLD <= 1.0

    def test_answer_relevancy_threshold_valid(self):
        """``ANSRELEVANCY_THRESHOLD`` is between 0.0 and 1.0 inclusive."""
        assert ANSRELEVANCY_THRESHOLD >= 0.0
        assert ANSRELEVANCY_THRESHOLD <= 1.0

    def test_bias_threshold_valid(self):
        """``BIAS_THRESHOLD`` is between 0.0 and 1.0 inclusive."""
        assert BIAS_THRESHOLD >= 0.0
        assert BIAS_THRESHOLD <= 1.0

    def test_toxicity_threshold_valid(self):
        """``TOXICITY_THRESHOLD`` is between 0.0 and 1.0 inclusive."""
        assert TOXICITY_THRESHOLD >= 0.0
        assert TOXICITY_THRESHOLD <= 1.0