Commit b70bb1f

meeting week 16: move the evaluation function to the repo

1 parent 031309f

File tree

3 files changed: +455 -0 lines changed

__init__.py

Whitespace-only changes.
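
(A whitespace-only __init__.py presumably just marks its directory as a Python package, so that evaluation_1to1.py is importable as a module.)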

app/evaluation_1to1.py

Lines changed: 153 additions & 0 deletions
@@ -0,0 +1,153 @@
import time
import os
import re
from langchain.schema.runnable import RunnableLambda
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv

class Config:
    def __init__(self, mode='gpt', llama_version='3_1_8B', temperature=0.01, max_new_token=5):
        load_dotenv()

        self.mode = mode  # Options: 'gpt', 'llama3'
        self.llama_version = llama_version
        self.temperature = temperature
        self.max_new_token = max_new_token
        self.openai_api_key = os.getenv('OPENAI_API_KEY')
        self.huggingfacehub_api_token = os.getenv("HUGGINGFACE_AUTHORIZATION")
        self.endpoint_3_1_8B = os.getenv("LLAMA3_1_8B_ENDPOINT")

def setup_llm(config):
    """Initialize the LLM (GPT-4o-mini or LLaMA 3.1) based on the given configuration."""
    if config.mode == 'gpt':
        return ChatOpenAI(
            model="gpt-4o-mini",
            temperature=config.temperature,
            max_tokens=config.max_new_token,
            openai_api_key=config.openai_api_key
        )
    elif config.mode == 'llama3':
        # Imported lazily so the 'gpt' path does not require langchain_huggingface
        from langchain_huggingface import HuggingFaceEndpoint
        return HuggingFaceEndpoint(
            endpoint_url=config.endpoint_3_1_8B,
            max_new_tokens=config.max_new_token,
            temperature=config.temperature,
            huggingfacehub_api_token=config.huggingfacehub_api_token
        )

def evaluation_function(response, answer, config=None):
    """Evaluate the given response against the answer using LLaMA 3 or GPT-4o-mini."""
    start_time = time.process_time()

    # Validate inputs before any LLM setup
    if not (isinstance(response, list) and all(isinstance(item, str) for item in response) and
            isinstance(answer, list) and all(isinstance(item, str) for item in answer)):
        return {"is_correct": False, "error": "Invalid input: response and answer must be lists of strings."}

    # Fall back to the default configuration if none is provided
    if config is None:
        config = Config()

    # Initialize LLM
    llm = setup_llm(config)

    # Define prompt template
    prompt_template = PromptTemplate(
        template='''
### Instruction:
Determine if the 2 words are semantically similar. Provide one of the following responses:
- "True" if the words are semantically the same.
- "False" if the words are semantically different.

### Examples:
Word1: "happy", Word2: "happy"
Response: True

Word1: "happy", Word2: "joyful"
Response: True

Word1: "cat", Word2: "dog"
Response: False

Word1: "bank", Word2: "actor"
Response: False

### Input:
Word1: {target}, Word2: {word}

### Response:
''',
        input_variables=["target", "word"]
    )

    # Helper to extract True/False from the model response (only the last occurrence counts)
    def parse_last_boolean(response):
        matches = re.findall(r'\b(true|false)\b', response, re.IGNORECASE)
        return matches[-1].capitalize() if matches else "Unsure"

    parser = RunnableLambda(parse_last_boolean)

    # Define processing chain
    chain = prompt_template | llm

    def recursive_evaluation(responses, answers):
        results = []
        matched_pairs = []        # Store matched word pairs
        unmatched_responses = []  # Store unmatched responses
        remaining_answers = set(answers)  # Fast removal; note that duplicate answers collapse

        for res in responses:
            matched_word = None
            for ans in list(remaining_answers):  # Copy to a list so the set can shrink mid-loop
                eval_result = chain.invoke({"word": res, "target": ans})
                # ChatOpenAI returns a message object; HuggingFaceEndpoint returns a plain string
                eval_result_content = eval_result.content if config.mode == 'gpt' else eval_result
                similarity_result = parser.invoke(eval_result_content)

                if similarity_result == "True":
                    matched_word = ans
                    matched_pairs.append((res, ans))
                    remaining_answers.discard(ans)  # Each answer can be matched at most once
                    break  # Exit loop after first match

            if matched_word:
                results.append(True)
            else:
                results.append(False)
                unmatched_responses.append(res)

        return all(results), matched_pairs, unmatched_responses

    # # Earlier single-pair LLM-based evaluation, kept for reference:
    # response = chain.invoke({"word": response, "target": answer})

    # # OpenAI and Hugging Face expose their outputs differently, so the parser is invoked separately
    # is_correct = parser.invoke(response.content if config.mode == 'gpt' else response)

    is_correct, correct_answers, incorrect_answers = recursive_evaluation(response, answer)
    return {
        "is_correct": is_correct,
        "result": {
            "response": {"correct": correct_answers,
                         "incorrect": incorrect_answers},
            "processing_time": time.process_time() - start_time,
            "method": "LLM-based comparison"
        },
        "feedback": "Feedback generation agent not implemented yet."
    }

# Example usage
if __name__ == "__main__":
    custom_config = Config()
    print(evaluation_function(
        ["Density", "Density", "Density"],  # response
        ["Density", "Viscosity", "Length", "Density", "Gravity", "Viscosity", "Length"],  # answer
        custom_config
    ))

    # print(evaluation_function(
    #     "Molecules are made out of atoms",
    #     "Many atoms form a molecule",
    #     {'keystrings': [{'string': 'molecule'}, {'string': 'proton', 'exact_match': True}]},
    #     custom_config
    # ))
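
For a quick local check without an OPENAI_API_KEY or a Hugging Face endpoint, the same prompt -> model -> parser pipeline can be exercised with a stub standing in for the real LLM. This is a minimal sketch, not part of the commit; stub_llm and its exact-match rule are illustrative assumptions:

import re
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnableLambda

prompt = PromptTemplate(
    template='Word1: {target}, Word2: {word}\n### Response:',
    input_variables=["target", "word"],
)

def stub_llm(prompt_value):
    # Hypothetical stand-in for the real model: receives the formatted
    # PromptValue and answers "True" only on an exact string match
    text = prompt_value.to_string()
    m = re.search(r'Word1: (\S+), Word2: (\S+)', text)
    return "True" if m and m.group(1) == m.group(2) else "False"

def parse_last_boolean(response):
    # Same parsing rule as evaluation_1to1.py: keep only the last True/False
    matches = re.findall(r'\b(true|false)\b', response, re.IGNORECASE)
    return matches[-1].capitalize() if matches else "Unsure"

chain = prompt | RunnableLambda(stub_llm) | RunnableLambda(parse_last_boolean)

print(chain.invoke({"target": "Density", "word": "Density"}))    # True
print(chain.invoke({"target": "Density", "word": "Viscosity"}))  # False

Note that the real chain above stops at prompt_template | llm and invokes the parser in a separate step, because ChatOpenAI returns a message object while HuggingFaceEndpoint returns a plain string.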
