Add negative examples, synonym toggle, custom feedback; Run docker tests

cc1619 · cc1619 · commit b6c614a1bda4 · 2023-05-18T12:02:06.000+01:00
diff --git a/app/evaluation.py b/app/evaluation.py
@@ -3,11 +3,13 @@
 import time
 
 import gensim
+import matplotlib.pyplot as plt
 import numpy as np
 import numpy.linalg
 from nltk.corpus import stopwords
 from nltk import word_tokenize
 
+
 def evaluation_function(response, answer, params):
     """
     Function used to evaluate a student response.
@@ -27,61 +29,77 @@ def evaluation_function(response, answer, params):
     available on pip (provided it is added to requirements.txt).
 
     The way you wish to structure you code (all in this function, or 
-    split into many) is entirely up to you. All that matters are the 
+    split into many) is entirely up to you. All that matters are the
     return types and that evaluation_function() is the main function used 
     to output the evaluation response.
     """
-
-    w2v_similarity = sentence_similarity_mean_w2v(response, answer)
-
-    # if params is not None and "keywords" in params:
-    #     keywords = params["keywords"]
-    #     for keyword in keywords:
-    #         for resp_score in response_scores:
-    #             if resp_score[1] == keyword:
-    #                 continue
-    #         return {
-    #             "is_correct": False,
-    #             "result": {
-    #                 "similarity_value": similarity,
-    #                 "Problematic_word": keyword
-    #             },
-    #             "feedback": f"Cannot determine if the answer is correct. Please provide more details about '{keyword}"
-    #         }
-
-    # params of the form {'keyphrase': ['phrase1', 'phrase2', ...]}
-    if params is not None and "keyphrases" in params:
-        keyphrases = params["keyphrases"]
-        for keyphrase in keyphrases:
-            response_tokens = preprocess_tokens(response)
-            keyphrase_tokens = preprocess_tokens(keyphrase)
-            window_size = len(keyphrase_tokens)
+    start_time = time.process_time()
+
+    # params of the form {'keystrings': [keystring_object1, keystring_object2, ...]}
+    # keystring_object of the form {'string': ..., 'exact_match': False, 'should_contain': True, 'custom_feedback': None}; only 'string' is required, the other values shown are the defaults
+    if params is not None and "keystrings" in params:
+        keystrings = params["keystrings"]
+        problematic_keystring = None
+        keystring_scores = []
+        response_tokens = preprocess_tokens(response)
+        for keystring_object in keystrings:
+            # Unpack keystring object
+            keystring = keystring_object['string']
+            exact_match = keystring_object['exact_match'] if 'exact_match' in keystring_object else False
+            should_contain = keystring_object['should_contain'] if 'should_contain' in keystring_object else True
+            custom_feedback = keystring_object['custom_feedback'] if 'custom_feedback' in keystring_object else None
+            keystring_tokens = preprocess_tokens(keystring)
+
+            # Sliding window matching
+            window_size = len(keystring_tokens)
             i = 0
-            found = False
+            max_score = 0
             while i + window_size <= len(response_tokens):
-                response_substring = " ".join(response_tokens[i:i+window_size])
-                score = sentence_similarity_mean_w2v(response_substring, keyphrase)
+                response_substring = " ".join(response_tokens[i:i + window_size])
+                score1 = sentence_similarity_mean_w2v(response_substring, keystring)
+                score2, _, _ = sentence_similarity(response_substring, keystring)
+                max_score = max(score1, score2, max_score)
                 i += 1
-                if score > 0.75:
-                    found = True
-                    continue
-            if not found:
-                return {
-                    "is_correct": False,
-                    "result": {
-                        "similarity_value": w2v_similarity,
-                        "Problematic_word": keyphrase
-                    },
-                    "feedback": f"Cannot determine if the answer is correct. Could not identify '{keyphrase}"
-                }
+            keystring_scores.append((keystring, max_score))
+
+            threshold = 0.75
+            if exact_match is True:
+                threshold = 0.99
+
+            if should_contain is True and max_score < threshold and problematic_keystring is None:
+                problematic_keystring = keystring
+                feedback = f"Cannot determine if the answer is correct. Please provide more information about '{problematic_keystring}'"
+
+            if should_contain is False and max_score > threshold and problematic_keystring is None:
+                problematic_keystring = keystring
+                feedback = f"Cannot determine if the answer is correct. Identified '{problematic_keystring}' in the answer, which was not expected."
+
+            if custom_feedback is not None:
+                feedback = f"Cannot determine if the answer is correct. {custom_feedback}"
+
+        if problematic_keystring is not None:
+            return {
+                "is_correct": False,
+                "result": {
+                    "response": response,
+                    "processing_time": time.process_time() - start_time,
+                    "keystring-scores": keystring_scores
+                },
+                "feedback": feedback
+            }
+
+    w2v_similarity = sentence_similarity_mean_w2v(response, answer)
 
     if w2v_similarity > 0.75:
         return {
             "is_correct": True,
             "result": {
+                "response": response,
+                "processing_time": time.process_time() - start_time,
+                "method": "w2v",
                 "similarity_value": w2v_similarity
             },
-            "feedback": "Correct!"
+            "feedback": f"Confidence: {w2v_similarity}%"
         }
 
     else:
@@ -96,10 +114,14 @@ def evaluation_function(response, answer, params):
         return {
             "is_correct": False,
             "result": {
+                "response": response,
+                "processing_time": time.process_time() - start_time,
+                "method": "BOW vector similarity",
                 "similarity_value": w2v_similarity,
-                "Problematic_word": word
+                "BOW_similarity_value": similarity,
+                "problematic_word": word
             },
-            "feedback": f"Cannot determine if the answer is correct. Please provide more details about '{word}"
+            "feedback": f"Cannot determine if the answer is correct ({w2v_similarity}% similarity). {f'Please provide more information about {word}' if word is not None else ''}"
         }
 
 
@@ -147,7 +169,9 @@ def sencence_scores(common_words, sentence):
                     best_similarity = similarity
                     best_word = other_word
             scores.append(
-                (best_similarity * word_information_content(word, blen, freqs) * word_information_content(best_word, blen, freqs), word))
+                (best_similarity * word_information_content(word, blen, freqs) * word_information_content(best_word,
+                                                                                                          blen, freqs),
+                 word))
         return scores
 
     response_scores = sencence_scores(all_words, response_words)
@@ -168,6 +192,7 @@ def preprocess_tokens(text: str):
     tokens = [word for word in word_tokenize(text) if word not in to_remove]
     return tokens
 
+
 def sentence_similarity_mean_w2v(response: str, answer: str):
     with open('w2v', 'rb') as fp:
         w2v = pickle.load(fp)
@@ -179,12 +204,18 @@ def sentence_similarity_mean_w2v(response: str, answer: str):
         return 0
     response_vector = np.mean(response_embeddings, axis=0)
     answer_vector = np.mean(answer_embeddings, axis=0)
-    return float(np.dot(response_vector, answer_vector) / (np.linalg.norm(response_vector) * np.linalg.norm(answer_vector)))
+    return float(
+        np.dot(response_vector, answer_vector) / (np.linalg.norm(response_vector) * np.linalg.norm(answer_vector)))
     # TODO
 
+
 if __name__ == "__main__":
     pass
-    # print(time.process_time())
-    # print(evaluation_function("density, velocity,Visc", "Density, Velocity, Viscosity, Length", {'keyphrases': ['Density', 'Velocity', 'Viscosity', 'Length']}))
-    # print(evaluation_function("test", "test", None))
-    # print(time.process_time())
+    print(evaluation_function("Density, speed, Viscosity, Length", "Density, Velocity, Viscosity, Length", {'keystrings': [{"string": "density"}, {"string": "velocity", "exact_match": False, 'should_contain': False}, {"string": "viscosity"}, {"string": "length"}]}))
+
+# File sizes / Location / Permissions
+# Clear everything including nltk. Test with small files.
+#
+# Confidence score for evaluations of answers, grouped by 'correct'/'incorrect' answers
+#
+
diff --git a/app/evaluation_tests.py b/app/evaluation_tests.py
@@ -61,8 +61,8 @@ def test_reynolds_number_is_incorrect(self):
 
             self.assertEqual(result.get("is_correct"), False, msg=f'Response: {response}')
 
-    def test_reynolds_number_is_incorrect_with_keyphrase(self):
-        answer, params = 'Density, Velocity, Viscosity, Length', {'keyphrases': ['density', 'velocity', 'viscosity', 'length']}
+    def test_reynolds_number_is_incorrect_with_keystring(self):
+        answer, params = 'Density, Velocity, Viscosity, Length', {'keystrings': [{'string': 'density'}, {'string': 'velocity'}, {'string': 'viscosity'}, {'string': 'length'}]}
         incorrect_responses = [
             'density,velocity,visc,',
         ]
@@ -75,19 +75,26 @@ def test_reynolds_number_is_incorrect_with_keyphrase(self):
     navier_stokes_answer = "The density of the film is uniform and constant, therefore the flow is incompressible. " \
                            "Since we have incompressible flow, uniform viscosity, Newtonian fluid, " \
                            "the most appropriate set of equations for the solution of the problem is the " \
-                           "Navier-Stokes equations. The Navier-Stokes equations in Cartesian coordinates are used."
-    # TODO: Navier-stokes equations
+                           "Navier-Stokes equations. The Navier-Stokes equations in Cartesian coordinates are used: " \
+                           "mass conservation and components of the momentum balance"
+
+    navier_stokes_params = {'keystrings': [{'string': 'Navier-Stokes equations'}, {'string': 'mass conservation'},
+                                                                    {'string': 'momentum balance'}, {'string': 'incompressible flow'},
+                                                                    {'string': 'uniform viscosity'}, {'string': 'Newtonian fluid'}]}
 
     def test_navier_stokes_equation(self):
         answer, params = self.navier_stokes_answer, dict()
         correct_responses = [
             #'Navier-stokes. Continuum, const and uniform density and viscosity so incompressible, newtonian. Fits all '
-            #'requirements for navier stokes'
+            #'requirements for navier stokes',
+            'Navier-Stokes in a Cartesian reference coordinates would be chosen for this particular flow. This is due '
+            'to the reason that the flow is Newtonian, the viscosity is uniform and constant. Additionally, '
+            'the density is uniform and constant; implying that it is an incompressible flow. This flow obeys the '
+            'main assumptions in order to employ the Navier Stokes equations.',
         ]
 
         for response in correct_responses:
             result = evaluation_function(response, answer, params)
-            print(result)
             self.assertEqual(result.get("is_correct"), True, msg=f'Response: {response}')
 
 if __name__ == "__main__":
diff --git a/app/requirements.txt b/app/requirements.txt
@@ -1,3 +1,4 @@
 numpy
 nltk
-gensim
+gensim
+matplotlib

-Original file line number
+Diff line change
@@ @@ -1,3 +1,4 @@ @@
 numpy
 nltk
 -gensim
 +gensim
 +matplotlib