1+ import time
2+ import os
3+ import re
4+ import pandas as pd
5+ from langchain .schema .runnable import RunnableLambda
6+ from langchain_openai import ChatOpenAI
7+ from langchain .prompts import PromptTemplate
8+ from dotenv import load_dotenv
9+
class Config:
    """Runtime configuration for the LLM evaluation pipeline.

    Loads API credentials from the environment (populated from a local
    .env file) and stores the model-selection knobs consumed by
    ``setup_llm``.
    """

    def __init__(self, mode='gpt', llama_version='3_1_8B', temperature=0.01, max_new_token=5):
        # Populate os.environ from a .env file before reading any keys.
        load_dotenv()

        self.mode = mode  # Options: 'gpt', 'llama3'
        self.llama_version = llama_version
        self.temperature = temperature
        self.max_new_token = max_new_token
        self.openai_api_key = os.getenv('OPENAI_API_KEY')
        self.huggingfacehub_api_token = os.getenv("HUGGINGFACE_AUTHORIZATION")
        self.endpoint_3_1_8B = os.getenv("LLAMA3_1_8B_ENDPOINT")
21+
def setup_llm(config):
    """Initialize the LLM (GPT-4o-mini or LLaMA 3) for the given configuration.

    Args:
        config: a Config instance; ``config.mode`` selects the backend
            ('gpt' or 'llama3').

    Returns:
        A ChatOpenAI or HuggingFaceEndpoint runnable.

    Raises:
        ValueError: if ``config.mode`` is not a recognized backend.
            (Previously an unknown mode silently returned None, which
            surfaced later as a confusing AttributeError at invoke time.)
    """
    if config.mode == 'gpt':
        return ChatOpenAI(
            model="gpt-4o-mini",
            temperature=config.temperature,
            max_tokens=config.max_new_token,
            openai_api_key=config.openai_api_key
        )
    elif config.mode == 'llama3':
        # Imported lazily so the HF dependency is only needed in llama3 mode.
        from langchain_huggingface import HuggingFaceEndpoint
        return HuggingFaceEndpoint(
            endpoint_url=config.endpoint_3_1_8B,
            max_new_tokens=config.max_new_token,
            temperature=config.temperature,
            huggingfacehub_api_token=config.huggingfacehub_api_token
        )
    raise ValueError(f"Unsupported mode {config.mode!r}; expected 'gpt' or 'llama3'.")
39+
def evaluation_function(response, answer, config=None):
    """Evaluate a list of response words against a list of answer words via an LLM.

    Each response word is compared to the not-yet-matched answer words
    with a semantic-similarity prompt; a response counts as correct when
    the LLM judges it equivalent to some remaining answer word.

    Args:
        response: list of str — the words produced by the student/user.
        answer: list of str — the reference words to match against.
        config: optional Config; a default ``Config()`` is built when None.

    Returns:
        dict with ``is_correct``, ``result`` (matched/unmatched words,
        timing, method) and ``feedback`` keys — or, for malformed input,
        a dict with ``is_correct`` False and an ``error`` message.
    """
    # Wall-clock timing: the dominant cost is network-bound LLM calls,
    # which time.process_time() (CPU time only) would not capture.
    start_time = time.perf_counter()

    # Validate inputs FIRST, before paying for LLM initialization.
    if not (isinstance(response, list) and all(isinstance(item, str) for item in response) and
            isinstance(answer, list) and all(isinstance(item, str) for item in answer)):
        return {"is_correct": False, "error": "Invalid input: response and answer must be lists of strings."}

    # Fall back to a default configuration when none is supplied.
    if config is None:
        config = Config()

    llm = setup_llm(config)

    # Few-shot prompt asking the model for a bare True/False verdict.
    prompt_template = PromptTemplate(
        template='''
### Instruction:
Determine if the 2 words are semantically similar. Provide one of the following responses:
- "True" if the words are semantically the same.
- "False" if the words are semantically different.

### Examples:
Word1: "happy", Word2: "happy"
Response: True

Word1: "happy", Word2: "joyful"
Response: True

Word1: "cat", Word 2: "dog"
Response: False

Word1: "bank", Word 2: "actor"
Response: False

### Input:
Word1:{target}, Word2:{word}

### Response:
''',
        input_variables=["target", "word"]
    )

    # Extract the LAST True/False token from the raw model output; models
    # sometimes echo the few-shot examples, so only the final verdict counts.
    def _parse_last_boolean(text):
        matches = re.findall(r'\b(true|false)\b', text, re.IGNORECASE)
        return matches[-1].capitalize() if matches else "Unsure"

    parser = RunnableLambda(_parse_last_boolean)

    # Prompt -> LLM chain; the parser is invoked separately because OpenAI
    # and HuggingFace backends return different result types.
    chain = prompt_template | llm

    def _greedy_match(responses, answers):
        """Greedily pair each response with the first remaining answer the LLM accepts."""
        results = []
        matched_pairs = []        # (response, answer) pairs judged equivalent
        unmatched_responses = []  # responses with no accepted answer left
        # NOTE(review): set() deduplicates answers, so a repeated answer word
        # can be matched at most once in total — confirm this is intended.
        remaining_answers = set(answers)

        for res in responses:
            matched_word = None
            for ans in list(remaining_answers):  # snapshot; we mutate the set
                eval_result = chain.invoke({"word": res, "target": ans})
                # ChatOpenAI returns a message object (.content); the HF
                # endpoint returns a plain string.
                eval_result_content = eval_result.content if config.mode == 'gpt' else eval_result
                similarity_result = parser.invoke(eval_result_content)

                if similarity_result == "True":
                    matched_word = ans
                    matched_pairs.append((res, ans))
                    remaining_answers.discard(ans)  # consume the matched answer
                    break  # first accepted match wins

            if matched_word:
                results.append(True)
            else:
                results.append(False)
                unmatched_responses.append(res)

        # all([]) is True: an empty response list evaluates as correct.
        return all(results), matched_pairs, unmatched_responses

    is_correct, correct_answers, incorrect_answers = _greedy_match(response, answer)
    return {
        "is_correct": is_correct,
        "result": {
            # NOTE(review): "corrrect" is a typo, kept because existing
            # callers may read this key; rename in a coordinated API change.
            "response": {"corrrect": correct_answers,
                         "incorrect": incorrect_answers},
            "processing_time": time.perf_counter() - start_time,
            "method": "LLM-based comparison"
        },
        "feedback": "Feedback generation agent not implemented yet."
    }
138+
# Example usage: quick smoke test against the configured LLM backend.
# (A stale commented-out example calling evaluation_function with a
# four-argument keystrings signature was removed — that signature does
# not exist in this module.)
if __name__ == "__main__":
    custom_config = Config()
    print(evaluation_function(
        ["Density", "Density", "Density"],  # response words
        ["Density", "Viscosity", "Length", "Density", "Gravity", "Viscosity", "Length"],  # answer words
        custom_config
    ))