11import unittest
2-
3- try :
4- from .evaluation import evaluation_function
5- except ImportError :
6- from evaluation import evaluation_function
2+ from evaluation import evaluation_function , Config
73
class TestEvaluationFunction(unittest.TestCase):
    """
    TestCase class for `evaluation_function()`.
    ---
    Every test passes a list-of-strings response, a list-of-strings answer
    and a shared `Config` instance, then inspects the returned mapping:
    - identical lists are accepted and unrelated lists are rejected;
    - a response with fewer items than `response_num_required` is rejected;
    - a synonym ("speed" for "velocity") is accepted;
    - an extra, unrelated item or a negated phrase is rejected;
    - the reported processing time stays under five seconds.

    The function is evaluated using an LLM-based semantic comparison.
    """

    @classmethod
    def setUpClass(cls):
        """Initialize a shared Config instance for LLM setup."""
        cls.config = Config()

    def test_basic_correct_response(self):
        """Test that a response identical to the answer is marked correct."""
        response = ["Density", "Velocity", "Viscosity", "Length"]
        answer = ["Density", "Velocity", "Viscosity", "Length"]
        result = evaluation_function(response, answer, self.config)

        self.assertTrue(result.get("is_correct"))

    def test_basic_incorrect_response(self):
        """Test that semantically different responses are marked incorrect."""
        response = ["Mass", "Speed", "Friction", "Force"]
        answer = ["Density", "Velocity", "Viscosity", "Length"]
        result = evaluation_function(response, answer, self.config)

        self.assertFalse(result.get("is_correct"))

    def test_partial_match(self):
        """Test that a response with too few items is marked incorrect."""
        response = ["Density", "Velocity", "Viscosity"]
        answer = ["Density", "Velocity", "Viscosity", "Length"]

        # The Config object is shared by every test in this class, so the
        # override below must be undone afterwards; otherwise the outcome of
        # the other tests would depend on execution order.
        config = self.config
        had_value = hasattr(config, "response_num_required")
        previous = getattr(config, "response_num_required", None)

        def restore():
            if had_value:
                config.response_num_required = previous
            else:
                del config.response_num_required

        # Registered before the override so it runs even if the call fails.
        self.addCleanup(restore)
        config.response_num_required = 4

        result = evaluation_function(response, answer, config)
        self.assertFalse(result.get("is_correct"))

    def test_synonyms_match(self):
        """Test that a synonym of the expected term is marked correct."""
        response = ["speed"]
        answer = ["velocity"]
        result = evaluation_function(response, answer, self.config)

        self.assertTrue(result.get("is_correct"))

    def test_exact_match_requirement(self):
        """Test a lower-case response where one item is only a synonym.

        No exact-match constraint is configured here, so "speed" standing in
        for "Velocity" is expected to be accepted.
        """
        response = ["density", "speed", "viscosity", "length"]
        answer = ["Density", "Velocity", "Viscosity", "Length"]

        result = evaluation_function(response, answer, self.config)
        self.assertTrue(result.get("is_correct"))

    def test_should_not_contain(self):
        """Test that an extra item absent from the answer fails the response.

        The response repeats every expected item and adds "direction", which
        is not part of the answer.
        """
        response = ["density", "velocity", "viscosity", "length", "direction"]
        answer = ["Density", "Velocity", "Viscosity", "Length"]

        result = evaluation_function(response, answer, self.config)
        self.assertFalse(result.get("is_correct"))

    def test_negation_handling(self):
        """Test that a negated or contrasting phrase is marked incorrect."""
        response = ["not light blue", "dark blue"]
        answer = ["light blue"]

        result = evaluation_function(response, answer, self.config)

        self.assertFalse(result.get("is_correct"))

    def test_performance(self):
        """Ensure that the reported processing time is reasonable."""
        response = ["Density", "Velocity", "Viscosity", "Length"]
        answer = ["Density", "Velocity", "Viscosity", "Length"]

        result = evaluation_function(response, answer, self.config)
        processing_time = result.get("result", {}).get("processing_time")

        # A missing timing must fail loudly: defaulting it to 0 would make
        # the threshold check below pass without measuring anything.
        self.assertIsNotNone(
            processing_time,
            msg="Evaluation result did not report a processing_time.",
        )
        self.assertLess(
            processing_time, 5, msg="Evaluation function should run efficiently."
        )
14891
# Run the test suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()
0 commit comments