Skip to content

Commit 311b07d

Browse files
committed
tested OK function
1 parent 1208231 commit 311b07d

File tree

3 files changed

+108
-239
lines changed

3 files changed

+108
-239
lines changed

app/evaluation.py

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -93,17 +93,36 @@ def evaluation_function(response, answer, config=None):
9393
- "True" if the words are semantically the same.
9494
- "False" if the words are semantically different.
9595
96+
9697
### Examples:
97-
Word1: "happy", Word2: "happy"
98+
Word1: "velocity", Word2: "speed"
99+
Response: True
100+
101+
Word1: "Pressure", Word2: "pressure"
102+
Response: True
103+
104+
Word1: "molecule", Word2: "molecules"
105+
Response: True
106+
107+
Word1: "math function", Word2: "math formulae"
98108
Response: True
99109
100-
Word1: "happy", Word2: "joyful"
110+
Word1: "photosynthesis", Word2: "plant energy conversion"
101111
Response: True
102112
103-
Word1: "cat", Word 2: "dog"
113+
Word1: "neuron", Word2: "planet"
114+
Response: False
115+
116+
Word1: "gravity", Word2: "voltage"
117+
Response: False
118+
119+
Word1: "robotic", Word2: "not robotic"
120+
Response: False
121+
122+
Word1: "molecular", Word2: "atomic"
104123
Response: False
105124
106-
Word1: "bank", Word 2: "actor"
125+
Word1: "dark blue", Word2: "light blue"
107126
Response: False
108127
109128
### Input:

app/evaluation_tests.py

Lines changed: 85 additions & 142 deletions
Original file line numberDiff line numberDiff line change
@@ -1,150 +1,93 @@
11
import unittest
2-
3-
try:
4-
from .evaluation import evaluation_function
5-
except ImportError:
6-
from evaluation import evaluation_function
2+
from evaluation import evaluation_function, Config
73

84
class TestEvaluationFunction(unittest.TestCase):
95
"""
10-
TestCase Class used to test the algorithm.
11-
---
12-
Tests are used here to check that the algorithm written
13-
is working as it should.
14-
15-
It's best practise to write these tests first to get a
16-
kind of 'specification' for how your algorithm should
17-
work, and you should run these tests before committing
18-
your code to AWS.
19-
20-
Read the docs on how to use unittest here:
21-
https://docs.python.org/3/library/unittest.html
22-
23-
Use evaluation_function() to check your algorithm works
24-
as it should.
6+
TestCase Class to test the evaluation function.
7+
---
8+
This test suite validates the correctness of `evaluation_function()` by checking:
9+
- Whether it correctly identifies correct and incorrect responses.
10+
- Whether it can handle keystring constraints.
11+
- Whether it can enforce exact matches.
12+
- Whether it processes responses efficiently and correctly.
13+
14+
The function is evaluated using an LLM-based semantic comparison.
2515
"""
26-
def test_returns_is_correct_true(self):
27-
response, answer, params = "A xor gate takes 2 inputs", "There are 2 inputs in a xor gate", dict()
28-
result = evaluation_function(response, answer, params)
29-
30-
self.assertEqual(result.get("is_correct"), True)
31-
32-
def test_reynolds_number_is_correct(self):
33-
answer, params = 'Density, Velocity, Viscosity, Length', dict()
34-
correct_responses = [
35-
'density,velocity,viscosity,length',
36-
'Density,Velocity,Viscosity,Length',
37-
'density,characteristic velocity,viscosity,characteristic length',
38-
'Density,Velocity,Shear viscosity,Length',
39-
'density,velocity,viscosity,lengthscale',
40-
'density,velocity,shear viscosity,length',
41-
'density,characteristic velocity,shear viscosity,characteristic lengthscale',
42-
'density,velocity,shear viscosity,characteristic lengthscale',
43-
'density,velocity,viscosity,length scale',
44-
'pressure,characteristic velocity of flow,shear viscosity,characteristic length scale',
45-
]
46-
47-
for response in correct_responses:
48-
result = evaluation_function(response, answer, params)
49-
50-
self.assertEqual(result.get("is_correct"), True, msg=f'Response: {response}')
51-
52-
def test_reynolds_number_is_incorrect(self):
53-
answer, params = 'Density, Velocity, Viscosity, Length', dict()
54-
incorrect_responses = [
55-
'density,,,',
56-
'rho,u,mu,L',
57-
]
58-
59-
for response in incorrect_responses:
60-
result = evaluation_function(response, answer, params)
61-
62-
self.assertEqual(result.get("is_correct"), False, msg=f'Response: {response}')
63-
64-
def test_reynolds_number_is_incorrect_with_keystring(self):
65-
answer, params = 'Density, Velocity, Viscosity, Length', {'keystrings': [{'string': 'density'}, {'string': 'velocity'}, {'string': 'viscosity'}, {'string': 'length'}]}
66-
incorrect_responses = [
67-
'density,velocity,visc,',
68-
]
69-
70-
for response in incorrect_responses:
71-
result = evaluation_function(response, answer, params)
72-
73-
self.assertEqual(result.get("is_correct"), False, msg=f'Response: {response}')
74-
75-
def test_reynolds_number_exact_match(self):
76-
answer, params = 'Density, Velocity, Viscosity, Length', {
77-
'keystrings': [{'string': 'velocity', 'exact_match': True}]}
78-
incorrect_responses = [
79-
'density,speed,viscosity, length',
80-
]
81-
82-
for response in incorrect_responses:
83-
result = evaluation_function(response, answer, params)
84-
85-
self.assertEqual(result.get("is_correct"), False, msg=f'Response: {response}')
86-
87-
def test_reynolds_number_should_not_contain(self):
88-
answer, params = 'Density, Velocity, Viscosity, Length', {
89-
'keystrings': [{'string': 'direction', 'should_contain': False}]}
90-
incorrect_responses = [
91-
'density,speed,viscosity, length, direction',
92-
]
93-
94-
for response in incorrect_responses:
95-
result = evaluation_function(response, answer, params)
96-
97-
self.assertEqual(result.get("is_correct"), False, msg=f'Response: {response}')
98-
99-
def test_reynolds_number_custom_feedback(self):
100-
answer, params = 'Density, Velocity, Viscosity, Length', {
101-
'keystrings': [{'string': 'banana', 'custom_feedback': 'custom feedback with the word banana'}]}
102-
incorrect_responses = [
103-
'An incorrect response',
104-
]
105-
106-
for response in incorrect_responses:
107-
result = evaluation_function(response, answer, params)
108-
109-
self.assertIn('banana', result.get("feedback"), msg=f'Response: {response}')
110-
111-
navier_stokes_answer = "The density of the film is uniform and constant, therefore the flow is incompressible. " \
112-
"Since we have incompressible flow, uniform viscosity, Newtonian fluid, " \
113-
"the most appropriate set of equations for the solution of the problem is the " \
114-
"Navier-Stokes equations. The Navier-Stokes equations in Cartesian coordinates are used: " \
115-
"mass conservation and components of the momentum balance"
116-
117-
navier_stokes_params = {'keystrings': [{'string': 'Navier-Stokes equations'}, {'string': 'mass conservation'},
118-
{'string': 'momentum balance'}, {'string': 'incompressible flow'},
119-
{'string': 'uniform viscosity'}, {'string': 'Newtonian fluid'}]}
120-
121-
def test_navier_stokes_equation(self):
122-
answer, params = self.navier_stokes_answer, dict()
123-
correct_responses = [
124-
#'Navier-stokes. Continuum, const and uniform density and viscosity so incompressible, newtonian. Fits all '
125-
#'requirements for navier stokes',
126-
'Navier-Stokes in a Cartesian reference coordinates would be chosen for this particular flow. This is due '
127-
'to the reason that the flow is Newtonian, the viscosity is uniform and constant. Additionally, '
128-
'the density is uniform and constant; implying that it is an incompressible flow. This flow obeys the '
129-
'main assumptions in order to employ the Navier Stokes equations.',
130-
]
131-
132-
for response in correct_responses:
133-
result = evaluation_function(response, answer, params)
134-
self.assertEqual(result.get("is_correct"), True, msg=f'Response: {response}')
135-
136-
def test_negation(self):
137-
answer, params = 'light blue', dict()
138-
correct_responses = [
139-
'bright blue',
140-
'light blue',
141-
'not light blue', # WARNING: THIS test should be False, but the similarity algorithm cannot handle negations
142-
'dark blue' # WARNING: THIS test should be False, but the similarity algorithm cannot handle context understanding
143-
]
144-
145-
for response in correct_responses:
146-
result = evaluation_function(response, answer, params)
147-
self.assertEqual(result.get("is_correct"), True, msg=f'Response: {response}')
16+
17+
@classmethod
18+
def setUpClass(cls):
19+
"""Initialize a shared Config instance for LLM setup."""
20+
cls.config = Config()
21+
22+
def test_basic_correct_response(self):
23+
"""Test if semantically similar responses are marked correct."""
24+
response = ["Density", "Velocity", "Viscosity", "Length"]
25+
answer = ["Density", "Velocity", "Viscosity", "Length"]
26+
result = evaluation_function(response, answer, self.config)
27+
28+
self.assertTrue(result.get("is_correct"))
29+
30+
def test_basic_incorrect_response(self):
31+
"""Test if semantically different responses are marked incorrect."""
32+
response = ["Mass", "Speed", "Friction", "Force"]
33+
answer = ["Density", "Velocity", "Viscosity", "Length"]
34+
result = evaluation_function(response, answer, self.config)
35+
36+
self.assertFalse(result.get("is_correct"))
37+
38+
def test_partial_match(self):
39+
"""Test if a response that is too short is marked incorrect."""
40+
response = ["Density", "Velocity", "Viscosity"]
41+
answer = ["Density", "Velocity", "Viscosity", "Length"]
42+
43+
self.config.response_num_required = 4
44+
result = evaluation_function(response, answer, self.config)
45+
self.assertFalse(result.get("is_correct"))
46+
47+
48+
def test_synonyms_match(self):
49+
"""Test if synonyms are correctly identified."""
50+
response = ['speed']
51+
answer = ['velocity']
52+
result = evaluation_function(response, answer, self.config)
53+
54+
self.assertTrue(result.get("is_correct"))
55+
56+
def test_exact_match_requirement(self):
57+
"""Test that a synonym response is accepted when no exact-match constraint is set."""
58+
response = ["density", "speed", "viscosity", "length"]
59+
answer = ["Density", "Velocity", "Viscosity", "Length"]
60+
61+
result = evaluation_function(response, answer, self.config)
62+
self.assertTrue(result.get("is_correct"))
63+
64+
def test_should_not_contain(self):
65+
"""Test if a response with a prohibited keyword fails."""
66+
response = ["density", "velocity", "viscosity", "length", "direction"]
67+
answer = ["Density", "Velocity", "Viscosity", "Length"]
68+
69+
result = evaluation_function(response, answer, self.config)
70+
self.assertFalse(result.get("is_correct"))
71+
72+
73+
def test_negation_handling(self):
74+
"""Test how the model handles negation."""
75+
response = ["not light blue", "dark blue"]
76+
answer = ["light blue"]
77+
78+
result = evaluation_function(response, answer, self.config)
79+
80+
self.assertFalse(result.get("is_correct"))
81+
82+
def test_performance(self):
83+
"""Ensure that processing time is reasonable."""
84+
response = ["Density", "Velocity", "Viscosity", "Length"]
85+
answer = ["Density", "Velocity", "Viscosity", "Length"]
86+
87+
result = evaluation_function(response, answer, self.config)
88+
processing_time = result.get("result", {}).get("processing_time", 0)
89+
90+
self.assertLess(processing_time, 5, msg="Evaluation function should run efficiently.")
14891

14992
if __name__ == "__main__":
15093
unittest.main()

app/fitted_evaluation_test.py

Lines changed: 0 additions & 93 deletions
This file was deleted.

0 commit comments

Comments
 (0)