Skip to content

Commit b6c614a

Browse files
committed
Add negative examples, synonym toggle, custom feedback; Run docker tests
1 parent 71df346 commit b6c614a

File tree

3 files changed

+96
-57
lines changed

3 files changed

+96
-57
lines changed

app/evaluation.py

Lines changed: 81 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,13 @@
33
import time
44

55
import gensim
6+
import matplotlib.pyplot as plt
67
import numpy as np
78
import numpy.linalg
89
from nltk.corpus import stopwords
910
from nltk import word_tokenize
1011

12+
1113
def evaluation_function(response, answer, params):
1214
"""
1315
Function used to evaluate a student response.
@@ -27,61 +29,77 @@ def evaluation_function(response, answer, params):
2729
available on pip (provided it is added to requirements.txt).
2830
2931
The way you wish to structure your code (all in this function, or
30-
split into many) is entirely up to you. All that matters are the
32+
split into many) is entirely up to you. All that matters are the
3133
return types and that evaluation_function() is the main function used
3234
to output the evaluation response.
3335
"""
34-
35-
w2v_similarity = sentence_similarity_mean_w2v(response, answer)
36-
37-
# if params is not None and "keywords" in params:
38-
# keywords = params["keywords"]
39-
# for keyword in keywords:
40-
# for resp_score in response_scores:
41-
# if resp_score[1] == keyword:
42-
# continue
43-
# return {
44-
# "is_correct": False,
45-
# "result": {
46-
# "similarity_value": similarity,
47-
# "Problematic_word": keyword
48-
# },
49-
# "feedback": f"Cannot determine if the answer is correct. Please provide more details about '{keyword}"
50-
# }
51-
52-
# params of the form {'keyphrase': ['phrase1', 'phrase2', ...]}
53-
if params is not None and "keyphrases" in params:
54-
keyphrases = params["keyphrases"]
55-
for keyphrase in keyphrases:
56-
response_tokens = preprocess_tokens(response)
57-
keyphrase_tokens = preprocess_tokens(keyphrase)
58-
window_size = len(keyphrase_tokens)
36+
start_time = time.process_time()
37+
38+
# params of the form {'keystrings': [keystring_object1, keystring_object2, ...]}
39+
# each keystring object is of the form {'string': ..., 'exact_match': False, 'should_contain': True, 'custom_feedback': None}; only 'string' is required, the other keys default to the values shown
40+
if params is not None and "keystrings" in params:
41+
keystrings = params["keystrings"]
42+
problematic_keystring = None
43+
keystring_scores = []
44+
response_tokens = preprocess_tokens(response)
45+
for keystring_object in keystrings:
46+
# Unpack keystring object
47+
keystring = keystring_object['string']
48+
exact_match = keystring_object['exact_match'] if 'exact_match' in keystring_object else False
49+
should_contain = keystring_object['should_contain'] if 'should_contain' in keystring_object else True
50+
custom_feedback = keystring_object['custom_feedback'] if 'custom_feedback' in keystring_object else None
51+
keystring_tokens = preprocess_tokens(keystring)
52+
53+
# Sliding window matching
54+
window_size = len(keystring_tokens)
5955
i = 0
60-
found = False
56+
max_score = 0
6157
while i + window_size <= len(response_tokens):
62-
response_substring = " ".join(response_tokens[i:i+window_size])
63-
score = sentence_similarity_mean_w2v(response_substring, keyphrase)
58+
response_substring = " ".join(response_tokens[i:i + window_size])
59+
score1 = sentence_similarity_mean_w2v(response_substring, keystring)
60+
score2, _, _ = sentence_similarity(response_substring, keystring)
61+
max_score = max(score1, score2, max_score)
6462
i += 1
65-
if score > 0.75:
66-
found = True
67-
continue
68-
if not found:
69-
return {
70-
"is_correct": False,
71-
"result": {
72-
"similarity_value": w2v_similarity,
73-
"Problematic_word": keyphrase
74-
},
75-
"feedback": f"Cannot determine if the answer is correct. Could not identify '{keyphrase}"
76-
}
63+
keystring_scores.append((keystring, max_score))
64+
65+
threshold = 0.75
66+
if exact_match is True:
67+
threshold = 0.99
68+
69+
if should_contain is True and max_score < threshold and problematic_keystring is None:
70+
problematic_keystring = keystring
71+
feedback = f"Cannot determine if the answer is correct. Please provide more information about '{problematic_keystring}'"
72+
73+
if should_contain is False and max_score > threshold and problematic_keystring is None:
74+
problematic_keystring = keystring
75+
feedback = f"Cannot determine if the answer is correct. Identified '{problematic_keystring}' in the answer, which was not expected."
76+
77+
if custom_feedback is not None:
78+
feedback = f"Cannot determine if the answer is correct. {custom_feedback}"
79+
80+
if problematic_keystring is not None:
81+
return {
82+
"is_correct": False,
83+
"result": {
84+
"response": response,
85+
"processing_time": time.process_time() - start_time,
86+
"keystring-scores": keystring_scores
87+
},
88+
"feedback": feedback
89+
}
90+
91+
w2v_similarity = sentence_similarity_mean_w2v(response, answer)
7792

7893
if w2v_similarity > 0.75:
7994
return {
8095
"is_correct": True,
8196
"result": {
97+
"response": response,
98+
"processing_time": time.process_time() - start_time,
99+
"method": "w2v",
82100
"similarity_value": w2v_similarity
83101
},
84-
"feedback": "Correct!"
102+
"feedback": f"Confidence: {w2v_similarity}%"
85103
}
86104

87105
else:
@@ -96,10 +114,14 @@ def evaluation_function(response, answer, params):
96114
return {
97115
"is_correct": False,
98116
"result": {
117+
"response": response,
118+
"processing_time": time.process_time() - start_time,
119+
"method": "BOW vector similarity",
99120
"similarity_value": w2v_similarity,
100-
"Problematic_word": word
121+
"BOW_similarity_value": similarity,
122+
"problematic_word": word
101123
},
102-
"feedback": f"Cannot determine if the answer is correct. Please provide more details about '{word}"
124+
"feedback": f"Cannot determine if the answer is correct ({w2v_similarity}% similarity). {f'Please provide more information about {word}' if word is not None else ''}"
103125
}
104126

105127

@@ -147,7 +169,9 @@ def sencence_scores(common_words, sentence):
147169
best_similarity = similarity
148170
best_word = other_word
149171
scores.append(
150-
(best_similarity * word_information_content(word, blen, freqs) * word_information_content(best_word, blen, freqs), word))
172+
(best_similarity * word_information_content(word, blen, freqs) * word_information_content(best_word,
173+
blen, freqs),
174+
word))
151175
return scores
152176

153177
response_scores = sencence_scores(all_words, response_words)
@@ -168,6 +192,7 @@ def preprocess_tokens(text: str):
168192
tokens = [word for word in word_tokenize(text) if word not in to_remove]
169193
return tokens
170194

195+
171196
def sentence_similarity_mean_w2v(response: str, answer: str):
172197
with open('w2v', 'rb') as fp:
173198
w2v = pickle.load(fp)
@@ -179,12 +204,18 @@ def sentence_similarity_mean_w2v(response: str, answer: str):
179204
return 0
180205
response_vector = np.mean(response_embeddings, axis=0)
181206
answer_vector = np.mean(answer_embeddings, axis=0)
182-
return float(np.dot(response_vector, answer_vector) / (np.linalg.norm(response_vector) * np.linalg.norm(answer_vector)))
207+
return float(
208+
np.dot(response_vector, answer_vector) / (np.linalg.norm(response_vector) * np.linalg.norm(answer_vector)))
183209
# TODO
184210

211+
185212
if __name__ == "__main__":
186213
pass
187-
# print(time.process_time())
188-
# print(evaluation_function("density, velocity,Visc", "Density, Velocity, Viscosity, Length", {'keyphrases': ['Density', 'Velocity', 'Viscosity', 'Length']}))
189-
# print(evaluation_function("test", "test", None))
190-
# print(time.process_time())
214+
print(evaluation_function("Density, speed, Viscosity, Length", "Density, Velocity, Viscosity, Length", {'keystrings': [{"string": "density"}, {"string": "velocity", "exact_match": False, 'should_contain': False}, {"string": "viscosity"}, {"string": "length"}]}))
215+
216+
# File sizes / Location / Permissions
217+
# Clear everything including nltk. Test with small files.
218+
#
219+
# Confidence score for evaluations of answers, grouped by 'correct'/'incorrect' answers
220+
#
221+

app/evaluation_tests.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,8 @@ def test_reynolds_number_is_incorrect(self):
6161

6262
self.assertEqual(result.get("is_correct"), False, msg=f'Response: {response}')
6363

64-
def test_reynolds_number_is_incorrect_with_keyphrase(self):
65-
answer, params = 'Density, Velocity, Viscosity, Length', {'keyphrases': ['density', 'velocity', 'viscosity', 'length']}
64+
def test_reynolds_number_is_incorrect_with_keystring(self):
65+
answer, params = 'Density, Velocity, Viscosity, Length', {'keystrings': [{'string': 'density'}, {'string': 'velocity'}, {'string': 'viscosity'}, {'string': 'length'}]}
6666
incorrect_responses = [
6767
'density,velocity,visc,',
6868
]
@@ -75,19 +75,26 @@ def test_reynolds_number_is_incorrect_with_keyphrase(self):
7575
navier_stokes_answer = "The density of the film is uniform and constant, therefore the flow is incompressible. " \
7676
"Since we have incompressible flow, uniform viscosity, Newtonian fluid, " \
7777
"the most appropriate set of equations for the solution of the problem is the " \
78-
"Navier-Stokes equations. The Navier-Stokes equations in Cartesian coordinates are used."
79-
# TODO: Navier-stokes equations
78+
"Navier-Stokes equations. The Navier-Stokes equations in Cartesian coordinates are used: " \
79+
"mass conservation and components of the momentum balance"
80+
81+
navier_stokes_params = {'keystrings': [{'string': 'Navier-Stokes equations'}, {'string': 'mass conservation'},
82+
{'string': 'momentum balance'}, {'string': 'incompressible flow'},
83+
{'string': 'uniform viscosity'}, {'string': 'Newtonian fluid'}]}
8084

8185
def test_navier_stokes_equation(self):
8286
answer, params = self.navier_stokes_answer, dict()
8387
correct_responses = [
8488
#'Navier-stokes. Continuum, const and uniform density and viscosity so incompressible, newtonian. Fits all '
85-
#'requirements for navier stokes'
89+
#'requirements for navier stokes',
90+
'Navier-Stokes in a Cartesian reference coordinates would be chosen for this particular flow. This is due '
91+
'to the reason that the flow is Newtonian, the viscosity is uniform and constant. Additionally, '
92+
'the density is uniform and constant; implying that it is an incompressible flow. This flow obeys the '
93+
'main assumptions in order to employ the Navier Stokes equations.',
8694
]
8795

8896
for response in correct_responses:
8997
result = evaluation_function(response, answer, params)
90-
print(result)
9198
self.assertEqual(result.get("is_correct"), True, msg=f'Response: {response}')
9299

93100
if __name__ == "__main__":

app/requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
numpy
22
nltk
3-
gensim
3+
gensim
4+
matplotlib

0 commit comments

Comments
 (0)