import time

import gensim
+import matplotlib.pyplot as plt
import numpy as np
import numpy.linalg
from nltk.corpus import stopwords
from nltk import word_tokenize

+
def evaluation_function(response, answer, params):
    """
    Function used to evaluate a student response.
@@ -27,61 +29,77 @@ def evaluation_function(response, answer, params):
    available on pip (provided it is added to requirements.txt).

    The way you wish to structure your code (all in this function, or
-    split into many) is entirely up to you. All that matters are the
+    split into many) is entirely up to you. All that matters are the
    return types and that evaluation_function() is the main function used
    to output the evaluation response.
    """
-
-    w2v_similarity = sentence_similarity_mean_w2v(response, answer)
-
-    # if params is not None and "keywords" in params:
-    #     keywords = params["keywords"]
-    #     for keyword in keywords:
-    #         for resp_score in response_scores:
-    #             if resp_score[1] == keyword:
-    #                 continue
-    #             return {
-    #                 "is_correct": False,
-    #                 "result": {
-    #                     "similarity_value": similarity,
-    #                     "Problematic_word": keyword
-    #                 },
-    #                 "feedback": f"Cannot determine if the answer is correct. Please provide more details about '{keyword}"
-    #             }
-
-    # params of the form {'keyphrase': ['phrase1', 'phrase2', ...]}
-    if params is not None and "keyphrases" in params:
-        keyphrases = params["keyphrases"]
-        for keyphrase in keyphrases:
-            response_tokens = preprocess_tokens(response)
-            keyphrase_tokens = preprocess_tokens(keyphrase)
-            window_size = len(keyphrase_tokens)
+    start_time = time.process_time()
+
+    # params of the form {'keystrings': [keystring1, keystring2, ...]}
+    # keystring of the form {'string': ..., 'exact_match': False, 'should_contain': True, 'custom_feedback': None}
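+    # Illustrative example only (strings and feedback text here are hypothetical):
+    # params = {'keystrings': [
+    #     {'string': 'viscosity'},
+    #     {'string': 'pressure', 'exact_match': True, 'should_contain': False,
+    #      'custom_feedback': 'Pressure should not appear in the answer.'}
+    # ]}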
+    if params is not None and "keystrings" in params:
+        keystrings = params["keystrings"]
+        problematic_keystring = None
+        keystring_scores = []
+        response_tokens = preprocess_tokens(response)
+        for keystring_object in keystrings:
+            # Unpack keystring object
+            keystring = keystring_object['string']
+            exact_match = keystring_object.get('exact_match', False)
+            should_contain = keystring_object.get('should_contain', True)
+            custom_feedback = keystring_object.get('custom_feedback', None)
+            keystring_tokens = preprocess_tokens(keystring)
+
+            # Sliding window matching
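+            # e.g. a 2-token keystring slid over the tokens ['dynamic', 'viscosity', 'of', 'water']
+            # scores the windows 'dynamic viscosity', 'viscosity of', 'of water' (illustrative tokens only)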
+            window_size = len(keystring_tokens)
            i = 0
-            found = False
+            max_score = 0
            while i + window_size <= len(response_tokens):
-                response_substring = " ".join(response_tokens[i:i + window_size])
-                score = sentence_similarity_mean_w2v(response_substring, keyphrase)
+                response_substring = " ".join(response_tokens[i:i + window_size])
+                score1 = sentence_similarity_mean_w2v(response_substring, keystring)
+                score2, _, _ = sentence_similarity(response_substring, keystring)
+                max_score = max(score1, score2, max_score)
                i += 1
-                if score > 0.75:
-                    found = True
-                    continue
-            if not found:
-                return {
-                    "is_correct": False,
-                    "result": {
-                        "similarity_value": w2v_similarity,
-                        "Problematic_word": keyphrase
-                    },
-                    "feedback": f"Cannot determine if the answer is correct. Could not identify '{keyphrase}"
-                }
+            keystring_scores.append((keystring, max_score))
+
+            # Exact matches require a near-perfect score; otherwise allow fuzzier matches
+            threshold = 0.75
+            if exact_match:
+                threshold = 0.99
+
+            if should_contain and max_score < threshold and problematic_keystring is None:
+                problematic_keystring = keystring
+                feedback = f"Cannot determine if the answer is correct. Please provide more information about '{problematic_keystring}'."
+
+            if not should_contain and max_score > threshold and problematic_keystring is None:
+                problematic_keystring = keystring
+                feedback = f"Cannot determine if the answer is correct. Identified '{problematic_keystring}' in the answer, which was not expected."
+
+            # Custom feedback (if provided) replaces the default message for the flagged keystring
+            if custom_feedback is not None and problematic_keystring == keystring:
+                feedback = f"Cannot determine if the answer is correct. {custom_feedback}"
+
+        if problematic_keystring is not None:
+            return {
+                "is_correct": False,
+                "result": {
+                    "response": response,
+                    "processing_time": time.process_time() - start_time,
+                    "keystring_scores": keystring_scores
+                },
+                "feedback": feedback
+            }
+
+    w2v_similarity = sentence_similarity_mean_w2v(response, answer)

    if w2v_similarity > 0.75:
        return {
            "is_correct": True,
            "result": {
+                "response": response,
+                "processing_time": time.process_time() - start_time,
+                "method": "w2v",
                "similarity_value": w2v_similarity
            },
-            "feedback": "Correct!"
+            "feedback": f"Confidence: {w2v_similarity:.0%}"
        }

    else:
@@ -96,10 +114,14 @@ def evaluation_function(response, answer, params):
        return {
            "is_correct": False,
            "result": {
+                "response": response,
+                "processing_time": time.process_time() - start_time,
+                "method": "BOW vector similarity",
                "similarity_value": w2v_similarity,
-                "Problematic_word": word
+                "BOW_similarity_value": similarity,
+                "problematic_word": word
            },
-            "feedback": f"Cannot determine if the answer is correct. Please provide more details about '{word}"
+            "feedback": f"Cannot determine if the answer is correct ({w2v_similarity:.0%} similarity)."
+                        + (f" Please provide more information about '{word}'." if word is not None else "")
        }


@@ -147,7 +169,9 @@ def sencence_scores(common_words, sentence):
                    best_similarity = similarity
                    best_word = other_word
            scores.append(
-                (best_similarity * word_information_content(word, blen, freqs) * word_information_content(best_word, blen, freqs), word))
+                (best_similarity
+                 * word_information_content(word, blen, freqs)
+                 * word_information_content(best_word, blen, freqs),
+                 word))
        return scores

    response_scores = sencence_scores(all_words, response_words)
@@ -168,6 +192,7 @@ def preprocess_tokens(text: str):
    tokens = [word for word in word_tokenize(text) if word not in to_remove]
    return tokens

+
def sentence_similarity_mean_w2v(response: str, answer: str):
    with open('w2v', 'rb') as fp:
        w2v = pickle.load(fp)
@@ -179,12 +204,18 @@ def sentence_similarity_mean_w2v(response: str, answer: str):
        return 0
    response_vector = np.mean(response_embeddings, axis=0)
    answer_vector = np.mean(answer_embeddings, axis=0)
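    # Cosine similarity of the two mean embeddings: (a . b) / (|a| * |b|), a value in [-1, 1]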
-    return float(np.dot(response_vector, answer_vector) / (np.linalg.norm(response_vector) * np.linalg.norm(answer_vector)))
+    return float(
+        np.dot(response_vector, answer_vector) / (np.linalg.norm(response_vector) * np.linalg.norm(answer_vector)))
    # TODO

+
if __name__ == "__main__":
    pass
-    # print(time.process_time())
-    # print(evaluation_function("density, velocity,Visc", "Density, Velocity, Viscosity, Length", {'keyphrases': ['Density', 'Velocity', 'Viscosity', 'Length']}))
-    # print(evaluation_function("test", "test", None))
-    # print(time.process_time())
+    print(evaluation_function(
+        "Density, speed, Viscosity, Length",
+        "Density, Velocity, Viscosity, Length",
+        {"keystrings": [{"string": "density"},
+                        {"string": "velocity", "exact_match": False, "should_contain": False},
+                        {"string": "viscosity"},
+                        {"string": "length"}]}))
+
+    # File sizes / Location / Permissions
+    # Clear everything including nltk. Test with small files.
+    #
+    # Confidence score for evaluations of answers, grouped by 'correct'/'incorrect' answers
+    #
+
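A minimal smoke-test sketch for the changed evaluation_function, asserting only the response keys visible in this diff ("is_correct", "result", "feedback", "processing_time"); the module name `evaluation`, the test file name, and the pytest runner are assumptions, not part of this commit:

# Hypothetical test file, e.g. test_evaluation.py (name assumed)
from evaluation import evaluation_function  # assumed module name for the file changed above


def test_response_has_expected_keys():
    payload = evaluation_function(
        "Density, Velocity, Viscosity, Length",        # student response
        "Density, Velocity, Viscosity, Length",        # reference answer
        {"keystrings": [{"string": "viscosity"}]},     # require 'viscosity' to be present
    )
    # Only the payload structure is checked; exact scores depend on the pickled w2v model.
    assert isinstance(payload["is_correct"], bool)
    assert "feedback" in payload
    assert "result" in payload and "processing_time" in payload["result"]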