Skip to content

Commit 5d3fa49

Browse files
authored
Merge pull request #99 from VivekSinghDS/add/test-case
updated changes
2 parents 89919a3 + e039a90 commit 5d3fa49

File tree

4 files changed

+54
-26
lines changed

4 files changed

+54
-26
lines changed

config.yml

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,4 +69,15 @@ inference:
6969
use_cache: True
7070
do_sample: True
7171
top_p: 0.9
72-
temperature: 0.8
72+
temperature: 0.8
73+
74+
qa:
75+
llm_tests:
76+
- jaccard_similarity
77+
- dot_product
78+
- rouge_score
79+
- word_overlap
80+
- verb_percent
81+
- adjective_percent
82+
- noun_percent
83+
- summary_length

src/pydantic_models/config_model.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88
# TODO: Refactor this into multiple files...
99
HfModelPath = str
1010

11+
class QaConfig(BaseModel):
12+
llm_tests: Optional[List[str]] = Field([], description = "list of tests that need to be connected")
13+
1114

1215
class DataConfig(BaseModel):
1316
file_type: Literal["json", "csv", "huggingface"] = Field(

src/qa/qa.py

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
from abc import ABC, abstractmethod
22
from typing import Union, List, Tuple, Dict
33
import pandas as pd
4-
from toolkit.src.ui.rich_ui import RichUI
4+
from src.ui.rich_ui import RichUI
55
import statistics
6+
from src.qa.qa_tests import *
67

78

89
class LLMQaTest(ABC):
@@ -17,15 +18,27 @@ def get_metric(
1718
) -> Union[float, int, bool]:
1819
pass
1920

21+
class QaTestRegistry:
22+
registry = {}
2023

21-
class LLMTestSuite:
22-
def __init__(
23-
self,
24-
tests: List[LLMQaTest],
25-
prompts: List[str],
26-
ground_truths: List[str],
27-
model_preds: List[str],
28-
) -> None:
24+
@classmethod
25+
def register(cls, *names):
26+
def inner_wrapper(wrapped_class):
27+
for name in names:
28+
cls.registry[name] = wrapped_class
29+
return wrapped_class
30+
return inner_wrapper
31+
32+
@classmethod
33+
def create_tests_from_list(cls, test_names: List[str]) -> List[LLMQaTest]:
34+
return [cls.registry[name]() for name in test_names]
35+
36+
class LLMTestSuite():
37+
def __init__(self,
38+
tests:List[LLMQaTest],
39+
prompts:List[str],
40+
ground_truths:List[str],
41+
model_preds:List[str]) -> None:
2942

3043
self.tests = tests
3144
self.prompts = prompts

src/qa/qa_tests.py

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from nltk.corpus import stopwords
99
from nltk.tokenize import word_tokenize
1010
from nltk import pos_tag
11+
from src.qa.qa import QaTestRegistry as TestRegistry
1112

1213
model_name = "distilbert-base-uncased"
1314
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
@@ -17,22 +18,22 @@
1718
nltk.download("punkt")
1819
nltk.download("averaged_perceptron_tagger")
1920

20-
21+
@TestRegistry.register("summary_length")
2122
class LengthTest(LLMQaTest):
2223
@property
2324
def test_name(self) -> str:
24-
return "Summary Length Test"
25+
return "summary_length"
2526

2627
def get_metric(
2728
self, prompt: str, ground_truth: str, model_prediction: str
2829
) -> Union[float, int, bool]:
2930
return abs(len(ground_truth) - len(model_prediction))
3031

31-
32+
@TestRegistry.register("jaccard_similarity")
3233
class JaccardSimilarityTest(LLMQaTest):
3334
@property
3435
def test_name(self) -> str:
35-
return "Jaccard Similarity"
36+
return "jaccard_similarity"
3637

3738
def get_metric(
3839
self, prompt: str, ground_truth: str, model_prediction: str
@@ -46,11 +47,11 @@ def get_metric(
4647
similarity = intersection_size / union_size if union_size != 0 else 0
4748
return similarity
4849

49-
50+
@TestRegistry.register("dot_product")
5051
class DotProductSimilarityTest(LLMQaTest):
5152
@property
5253
def test_name(self) -> str:
53-
return "Semantic Similarity"
54+
return "dot_product"
5455

5556
def _encode_sentence(self, sentence):
5657
tokens = tokenizer(sentence, return_tensors="pt")
@@ -68,11 +69,11 @@ def get_metric(
6869
)
6970
return dot_product_similarity
7071

71-
72+
@TestRegistry.register("rouge_score")
7273
class RougeScoreTest(LLMQaTest):
7374
@property
7475
def test_name(self) -> str:
75-
return "Rouge Score"
76+
return "rouge_score"
7677

7778
def get_metric(
7879
self, prompt: str, ground_truth: str, model_prediction: str
@@ -81,11 +82,11 @@ def get_metric(
8182
scores = scorer.score(model_prediction, ground_truth)
8283
return float(scores["rouge1"].precision)
8384

84-
85+
@TestRegistry.register("word_overlap")
8586
class WordOverlapTest(LLMQaTest):
8687
@property
8788
def test_name(self) -> str:
88-
return "Word Overlap Test"
89+
return "word_overlap"
8990

9091
def _remove_stopwords(self, text: str) -> str:
9192
stop_words = set(stopwords.words("english"))
@@ -115,11 +116,11 @@ def _get_pos_percent(self, text: str, pos_tags: List[str]) -> float:
115116
total_words = len(text.split(" "))
116117
return round(len(pos_words) / total_words, 2)
117118

118-
119+
@TestRegistry.register("verb_percent")
119120
class VerbPercent(PosCompositionTest):
120121
@property
121122
def test_name(self) -> str:
122-
return "Verb Composition"
123+
return "verb_percent"
123124

124125
def get_metric(
125126
self, prompt: str, ground_truth: str, model_prediction: str
@@ -128,22 +129,22 @@ def get_metric(
128129
model_prediction, ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]
129130
)
130131

131-
132+
@TestRegistry.register("adjective_percent")
132133
class AdjectivePercent(PosCompositionTest):
133134
@property
134135
def test_name(self) -> str:
135-
return "Adjective Composition"
136+
return "adjective_percent"
136137

137138
def get_metric(
138139
self, prompt: str, ground_truth: str, model_prediction: str
139140
) -> float:
140141
return self._get_pos_percent(model_prediction, ["JJ", "JJR", "JJS"])
141142

142-
143+
@TestRegistry.register("noun_percent")
143144
class NounPercent(PosCompositionTest):
144145
@property
145146
def test_name(self) -> str:
146-
return "Noun Composition"
147+
return "noun_percent"
147148

148149
def get_metric(
149150
self, prompt: str, ground_truth: str, model_prediction: str

0 commit comments

Comments
 (0)