diff --git a/assignments_07/README.md b/assignments_07/README.md new file mode 100644 index 0000000..965996e --- /dev/null +++ b/assignments_07/README.md @@ -0,0 +1,141 @@ +# World Happiness Agent + +A mini-project built with the `smolagents` `CodeAgent` and an OpenAI model to explore the World Happiness dataset through natural language queries. + +## Project Overview + +This project demonstrates how an AI agent can combine predefined Python tools with dynamic code generation to analyze data conversationally. + +The agent can: + +- load the World Happiness dataset +- summarize dataset columns +- compute Pearson correlations with statistical significance +- rank countries by selected metrics +- generate custom visualizations when predefined tools are not sufficient + +## Technologies Used + +- Python +- Pandas +- Matplotlib +- SciPy +- smolagents +- OpenAI API +- dotenv + +## Project Structure + +```bash +assignments_07/ +│ +├── project_07.py +├── outputs/ +│ └── happiness_by_region.png +``` + +## Implemented Tools + +### load_happiness_data() + +Loads the merged World Happiness dataset into a shared global DataFrame. + +Returns: + +- dataset shape +- column names +- data path +- plot output path + +--- + +### summarize_column(column) + +Returns descriptive statistics for a selected column. + +Example: + +```python +summarize_column("happiness_score") +``` + +--- + +### compute_correlation(col1, col2) + +Computes: + +- Pearson correlation coefficient +- p-value + +Example: + +```python +compute_correlation("gdp_per_capita", "happiness_score") +``` + +--- + +### get_top_n_countries(column, year, n) + +Ranks countries by a selected metric for a given year.
+ +Example: + +```python +get_top_n_countries("happiness_score", 2020, 5) +``` + +## Example Queries + +Guided queries: + +- Load the happiness data and tell me its shape and column names +- Summarize the happiness_score column +- Compute correlation between GDP per capita and happiness score +- Show top 5 happiest countries in 2020 +- Generate a regional happiness trend chart + +Custom queries: + +- Which country improved its happiness score the most between 2015 and 2023? +- Which region has the highest average happiness score across all years? + +## Key Learning Outcomes + +This project helped demonstrate the difference between: + +**Tool-based agent behavior** +- safe +- predictable +- limited to predefined actions + +and + +**CodeAgent behavior** +- flexible +- capable of custom analysis +- able to generate Python code dynamically +- more prone to execution/runtime issues + +## How to Run + +Install dependencies: + +```bash +pip install -r requirements.txt +``` + +Run: + +```bash +python assignments_07/project_07.py +``` + +## Output + +Generated visualization: + +```bash +assignments_07/outputs/happiness_by_region.png +``` \ No newline at end of file diff --git a/assignments_07/outputs/happiness_by_region.png b/assignments_07/outputs/happiness_by_region.png new file mode 100644 index 0000000..d2feaad Binary files /dev/null and b/assignments_07/outputs/happiness_by_region.png differ diff --git a/assignments_07/outputs/my_plot.png b/assignments_07/outputs/my_plot.png new file mode 100644 index 0000000..86dc1d7 Binary files /dev/null and b/assignments_07/outputs/my_plot.png differ diff --git a/assignments_07/project_07.py b/assignments_07/project_07.py new file mode 100644 index 0000000..1337212 --- /dev/null +++ b/assignments_07/project_07.py @@ -0,0 +1,300 @@ +import os +from pathlib import Path + +import matplotlib + +matplotlib.use("Agg") + +import matplotlib.pyplot as plt +import pandas as pd +from dotenv import load_dotenv +from scipy.stats import pearsonr 
+from smolagents import CodeAgent, OpenAIServerModel, tool + + +# ================================================== +# Setup +# ================================================== + +load_dotenv() + +api_key = os.getenv("OPENAI_API_KEY") + +if not api_key: + raise ValueError("OPENAI_API_KEY not found in environment variables.") + +BASE_DIR = Path(__file__).parent +OUTPUTS_DIR = BASE_DIR / "outputs" +OUTPUTS_DIR.mkdir(exist_ok=True) + +DATA_PATH = BASE_DIR.parent / "assignments_01" / "outputs" / "merged_happiness.csv" +PLOT_PATH = OUTPUTS_DIR / "happiness_by_region.png" + +df = None + +print("Environment loaded.") +print("Data path:", DATA_PATH) + + +# ================================================== +# Task 1 — Tool 1: load_happiness_data +# ================================================== + +@tool +def load_happiness_data() -> dict: + """ + Load the World Happiness dataset into memory. + + Returns: + A dictionary containing the shape, column names, data path, and output plot path. + """ + global df + + if not DATA_PATH.exists(): + return {"error": f"Data file not found at {DATA_PATH}"} + + df = pd.read_csv(DATA_PATH) + + return { + "shape": df.shape, + "columns": df.columns.tolist(), + "data_path": str(DATA_PATH), + "plot_path": str(PLOT_PATH), + } + + +# ================================================== +# Task 1 — Tool 2: summarize_column +# ================================================== + +@tool +def summarize_column(column: str) -> dict: + """ + Return descriptive statistics for a single column in the loaded dataset. + + Args: + column: The name of the column to summarize. + + Returns: + A dictionary with descriptive statistics for the selected column. + """ + global df + + if df is None: + return {"error": "No data is loaded. Please call load_happiness_data first."} + + if column not in df.columns: + return { + "error": f"Column '{column}' not found. 
Available columns: {df.columns.tolist()}" + } + + return df[column].describe().to_dict() + + +# ================================================== +# Task 1 — Tool 3: compute_correlation +# ================================================== + +@tool +def compute_correlation(col1: str, col2: str) -> dict: + """ + Compute the Pearson correlation coefficient and p-value between two numeric columns. + + Args: + col1: The name of the first numeric column. + col2: The name of the second numeric column. + + Returns: + A dictionary containing the correlation coefficient and p-value. + """ + global df + + if df is None: + return {"error": "No data is loaded. Please call load_happiness_data first."} + + if col1 not in df.columns or col2 not in df.columns: + return {"error": "One or both columns not found in dataset."} + + try: + pearson_r, p_value = pearsonr(df[col1], df[col2]) + + return { + "col1": col1, + "col2": col2, + "pearson_r": round(float(pearson_r), 4), + "p_value": round(float(p_value), 4), + } + + except Exception as e: + return {"error": str(e)} + + +# ================================================== +# Task 1 — Tool 4: get_top_n_countries +# ================================================== + +@tool +def get_top_n_countries(column: str, year: int, n: int = 5) -> dict: + """ + Return the top N countries ranked by a given column for a specific year. + + Args: + column: The numeric column used for ranking. + year: The year to filter the dataset by. + n: Number of top countries to return. + + Returns: + A dictionary containing the ranked countries and values. + """ + global df + + if df is None: + return {"error": "No data is loaded. 
Please call load_happiness_data first."} + + if column not in df.columns: + return {"error": f"Column '{column}' not found."} + + filtered_df = df[df["year"] == year] + + if filtered_df.empty: + return {"error": f"No data found for year {year}."} + + top_rows = filtered_df.sort_values(by=column, ascending=False).head(n) + + result = [] + + for _, row in top_rows.iterrows(): + result.append( + { + "country": row["country"], + column: row[column], + } + ) + + return {"results": result} + + +# ================================================== +# Task 2 — Build the Agent +# ================================================== + +model = OpenAIServerModel( + api_key=api_key, + model_id="gpt-4o-mini", +) + +SYSTEM_PROMPT = """ +You are a data analyst assistant for the World Happiness dataset. + +Use the available tools for: +- loading data +- summarizing columns +- computing correlations +- ranking countries + +Important rules — follow them exactly: +1. NEVER mock or generate fake data. Always use the tools to load real data. +2. When creating any plot, ALWAYS start with these exact lines FIRST: + import matplotlib + matplotlib.use('Agg') + import matplotlib.pyplot as plt +3. For custom plots, call load_happiness_data() first, read the CSV from the + returned data_path using pandas, then save the plot to the exact absolute + path returned as plot_path. Never use a relative path like 'outputs/...'. + +Be concise and student-friendly in your responses. 
+""" + +agent = CodeAgent( + tools=[ + load_happiness_data, + summarize_column, + compute_correlation, + get_top_n_countries, + ], + model=model, + instructions=SYSTEM_PROMPT, + additional_authorized_imports=[ + "pandas", + "matplotlib", + "matplotlib.pyplot", + "scipy.stats", + ], + max_steps=8, +) + + +# ================================================== +# Task 3 — Run Guided Queries +# ================================================== + +if __name__ == "__main__": + queries = [ + "Load the happiness data and tell me its shape and column names.", + "Summarize the happiness_score column.", + "What is the correlation between gdp_per_capita and happiness_score? Is it statistically significant?", + "Show me the top 5 happiest countries in 2020.", + ( + "Plot happiness_score over the years as a line chart, with one line per region. " + "Use the data_path and plot_path from load_happiness_data. " + "Read the real CSV with pandas, group by year and regional_indicator, " + "and save the plot exactly to plot_path." + ), + ] + + for query in queries: + print(f"\n--- Query: {query} ---") + response = agent.run(query, reset=False) + print(response) + + print("\nExpected plot location:") + print(PLOT_PATH) + + # ================================================== + # Task 4 — Your Own Questions + # ================================================== + + my_query_1 = ( + "Which country improved its happiness_score the most between 2015 and 2023?" + ) + print(f"\n--- My Query 1: {my_query_1} ---") + response_1 = agent.run(my_query_1, reset=False) + print(response_1) + + # Comment: This triggered code generation because the agent needed to compare + # happiness_score changes between multiple years across countries. + + my_query_2 = ( + "Which region has the highest average happiness_score across all years?" 
+ ) + print(f"\n--- My Query 2: {my_query_2} ---") + response_2 = agent.run(my_query_2, reset=False) + print(response_2) + + # Comment: This triggered code generation because the agent needed to group + # the dataset by region and calculate average happiness scores. + + # ================================================== +# Task 5 — Reflection +# ================================================== + +# --- Reflection --- +# +# 1. In Query 3, the agent communicated statistical significance by returning +# both the Pearson correlation coefficient and the p-value. It used the +# p-value correctly: the reported 0.0 is the tool's value rounded to four +# decimals (so p < 0.00005), far below the 0.05 significance threshold, +# so the correlation was considered statistically significant. +# +# 2. One response that surprised me was Query 5, where the agent successfully +# generated custom pandas and matplotlib code to create a line chart grouped +# by region. This was more capable than I expected because none of the +# predefined tools directly supported multi-line plotting, so the agent had +# to reason through the steps and write the analysis code itself. +# +# 3. One additional tool that would make this agent more useful would be a +# custom plotting tool that accepts parameters such as x column, y column, +# grouping column, chart type, and output filename. This would allow the +# agent to create visualizations more reliably without needing to generate +# raw Python code, and it would help answer questions like: +# "Create a bar chart of average happiness_score by region for 2020."
diff --git a/assignments_07/resources/bike_commute.csv b/assignments_07/resources/bike_commute.csv new file mode 100644 index 0000000..f96d1fa --- /dev/null +++ b/assignments_07/resources/bike_commute.csv @@ -0,0 +1,161 @@ +distance_km,duration_min,avg_speed_kmh,avg_heart_rate,avg_traffic_density,rain +10.5,35.6,17.4,123,2,1 +9.86,47.4,12,113,7,0 +10.65,63.9,9.9,113,8,1 +11.52,44.2,15.9,117,3,0 +9.77,42.1,14.4,115,8,0 +9.77,38.7,14.5,116,6,0 +11.58,73.3,9.7,122,7,1 +10.77,58.1,11.7,119,10,1 +9.53,47.9,12.1,119,4,1 +10.54,44,15.7,117,3,1 +9.54,32.9,16.6,115,2,0 +9.53,36.8,14.6,118,3,0 +10.24,39.4,14.3,112,5,0 +8.09,40.3,13,118,6,0 +8.28,35.4,14.6,117,6,0 +9.44,39.7,14.2,119,7,1 +8.99,40.5,13.5,115,5,0 +10.31,41.9,14,120,9,0 +9.09,42.5,14.5,116,4,0 +8.59,48.8,10.6,113,10,1 +11.47,53.6,12.9,116,7,1 +9.77,42,14.5,112,2,0 +10.07,40,15.5,113,2,0 +8.58,45.5,11.4,118,6,1 +9.46,29.6,18.2,120,4,0 +10.11,47.6,13,115,7,1 +8.85,47.3,12.2,118,6,1 +10.38,43.1,15.4,117,5,0 +9.4,38.8,15.8,116,2,0 +9.71,37.1,15.3,120,0,1 +9.4,32.3,16.5,119,4,0 +11.85,46.8,15.1,119,8,0 +9.99,47.1,12.7,117,6,1 +8.94,39.4,14.4,111,1,1 +10.82,45.9,13.2,119,6,1 +8.78,56.3,9.9,114,6,1 +10.21,44.1,13.8,112,2,0 +8.04,27.7,16.9,115,5,0 +8.67,30.1,16.2,119,5,1 +10.2,34.9,16,117,2,0 +10.74,43.4,15.4,117,6,0 +10.17,45,13.6,121,7,1 +9.88,29.8,18.3,123,8,0 +9.7,35.2,15.4,118,8,0 +8.52,30.4,16.5,117,1,0 +9.28,41.2,14.7,111,2,0 +9.54,47.3,12,112,7,0 +11.06,42.2,14.7,115,7,0 +10.34,46.4,13.2,112,7,0 +8.24,50.6,9.7,115,10,1 +10.32,45.6,12.1,118,7,1 +9.61,53.2,10.8,115,8,1 +9.32,36.4,15.2,127,8,1 +10.61,42.4,15.5,115,7,0 +11.03,46.8,15.4,114,4,0 +10.93,45.7,15.1,121,7,1 +9.16,33.7,16,119,3,0 +9.69,43.2,12.8,117,4,1 +10.33,39.8,17.9,121,4,0 +10.98,45.5,14.5,115,5,1 +9.52,49.9,11.4,115,10,0 +9.81,37.9,15.5,120,0,1 +8.89,41.9,12.9,114,7,0 +8.8,38.4,13.7,117,0,1 +10.81,37.1,17,115,4,0 +11.36,43.3,15.4,128,8,1 +9.93,51.2,11.6,122,5,1 +11,37.4,17.1,115,2,0 +10.36,40.3,14.9,112,3,0 +9.35,46,12.3,122,7,1 +10.36,41,15,118,3,0 
+11.54,54.5,13.4,112,6,0 +9.96,34.5,15,114,5,0 +11.56,54.2,13.3,115,3,1 +7.38,35.5,13.4,113,10,0 +10.82,41.7,14.2,113,7,0 +10.09,38.5,15.4,114,0,1 +9.7,46.9,12.2,114,6,1 +10.09,37.7,14.9,113,3,0 +8.01,33,13.9,121,8,1 +9.78,32.9,16.7,120,3,0 +10.36,49,13.7,110,5,0 +11.48,49.2,14.5,119,7,0 +9.48,40.2,15.1,117,8,0 +9.19,41,13.9,111,1,0 +9.5,32.9,16.2,118,4,0 +10.92,47.9,13.4,113,4,1 +10.33,41.1,15.4,120,3,1 +9.47,50.9,10.7,118,10,1 +10.51,54.9,11.8,113,6,0 +10.1,42.6,14.1,112,1,0 +10.97,50.8,12.8,126,8,1 +9.3,47.2,12.2,118,10,1 +9.67,47.7,12.4,112,8,0 +9.61,34,16.6,117,0,1 +8.54,45.4,11.9,113,4,1 +10.3,48.6,12.2,117,9,1 +10.26,49.2,12.8,119,3,1 +10.01,50,12.3,121,6,1 +9.77,40.8,14.1,114,7,0 +8.58,38.5,13.6,113,2,1 +9.58,40.7,13.3,119,5,1 +9.66,33.2,18.5,115,0,0 +9.2,35.5,15.4,115,2,0 +9.84,34.6,16.6,118,4,0 +10.4,48,13.6,111,1,1 +11.89,55,12.6,125,10,1 +10.17,29.5,18.9,123,1,0 +10.26,49,11.8,108,4,0 +9.93,44.8,13.6,112,5,0 +8.08,34.9,12.9,121,9,1 +9.97,42.8,15.2,118,1,1 +10.06,42.4,13,122,8,1 +12.46,56.5,14.1,120,5,0 +9.81,36.5,16.3,114,2,0 +10.3,43.3,14.2,113,6,0 +9.97,35.1,16.5,114,6,0 +8.83,36.4,14.9,116,3,1 +11.14,46.9,14.2,119,5,1 +10.75,49.5,13.6,123,4,1 +10.79,50.3,12.9,116,5,1 +9.09,40.5,13.6,115,7,0 +11.4,54.1,12.5,120,10,1 +8.6,35.1,14.7,120,1,1 +10.59,52.5,12.2,128,10,1 +12.19,32.9,20.1,117,0,0 +9.01,30.1,16.5,116,5,0 +9.43,42.5,13.8,118,7,0 +10.1,40,15.3,123,6,1 +9.5,37.3,15.1,119,3,0 +8.45,41.6,12.2,112,4,0 +10.07,43.6,14.1,112,4,0 +8.94,40.5,12.9,112,3,0 +10.47,45.2,13.5,118,8,0 +9.08,37.7,14.6,118,6,0 +11.55,35.8,18.3,120,3,0 +9.22,39.2,14.4,120,8,0 +9.68,40.9,13.1,122,6,1 +10.81,44.6,15.3,122,7,0 +8.77,55,9.7,116,7,1 +10.23,38,16.3,115,3,0 +11.31,42.3,16.8,117,3,0 +8.39,50.5,10.7,119,7,1 +10.18,40.3,16,116,7,0 +10.26,35.9,15.5,117,5,0 +10.78,42.9,14.2,113,5,0 +8.76,38.5,13.2,121,9,1 +8.68,27.3,19.1,124,3,0 +10.52,48.1,13.4,120,7,1 +10.3,38.1,15.6,116,4,0 +10.25,42.6,14.6,110,4,0 +10.35,48.5,12.4,116,8,0 +9.32,35.4,15.3,119,7,0 +10.23,50.4,11.5,123,7,1 
+10.29,43.8,13.5,115,9,0 +9.29,36.4,14.3,115,5,0 +11.87,50,13.7,121,7,1 +10.47,41.7,15.9,120,4,0 +8.81,30.8,16.2,117,6,0 +10.66,53.7,13.2,118,5,1 diff --git a/assignments_07/warmup_07.py b/assignments_07/warmup_07.py new file mode 100644 index 0000000..e2cdadd --- /dev/null +++ b/assignments_07/warmup_07.py @@ -0,0 +1,764 @@ +# ================================================== +# Lesson 02: Tool Definitions and the ReAct Loop +# ================================================== + +import os +import json +from datetime import datetime + +from dotenv import load_dotenv +from openai import OpenAI + + +# ================================================== +# Setup +# ================================================== + +if load_dotenv(): + print("Successfully loaded environment variables from .env") +else: + print("Warning: could not load environment variables from .env") + +client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) +print("OpenAI client created.") + + +# ================================================== +# Q1 +# ================================================== + +def celsius_to_fahrenheit(celsius: float) -> str: + """Convert a Celsius temperature to Fahrenheit and return it as a formatted string.""" + fahrenheit = (celsius * 9 / 5) + 32 + return f"{celsius}°C is {fahrenheit}°F" + + +celsius_to_fahrenheit_schema = { + "type": "function", + "function": { + "name": "celsius_to_fahrenheit", + "description": "Convert a Celsius temperature to Fahrenheit.", + "parameters": { + "type": "object", + "properties": { + "celsius": { + "type": "number", + "description": "Temperature in Celsius", + } + }, + "required": ["celsius"], + }, + }, +} + + +print(celsius_to_fahrenheit(0)) +print(celsius_to_fahrenheit(100)) +print(celsius_to_fahrenheit(-40)) + +""" +Output: +0°C is 32.0°F +100°C is 212.0°F +-40°C is -40.0°F +""" + + +# ================================================== +# Q2 +# ================================================== + +""" +Prediction: + +Will 
calling run_agent("Convert 100 degrees Celsius to Fahrenheit") trigger a tool call? +No. + +Why? +Because this version of the agent only has one available tool: get_current_time. +The query asks for a temperature conversion, not the current time. + +How many API calls will be made? +One API call. + +If the model does not request a tool, the first response is already the final answer. +""" + + +def get_current_time() -> str: + """Return the current local time as a formatted string.""" + return datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + +time_tool_schema = { + "type": "function", + "function": { + "name": "get_current_time", + "description": "Returns the current local time as a string.", + "parameters": { + "type": "object", + "properties": {}, + "required": [], + }, + }, +} + + +tools = [time_tool_schema] +print("Tools list defined with one tool: get_current_time") + + +def run_agent(user_prompt: str) -> str: + """Run a minimal ReAct-style agent for a single user prompt.""" + + SYSTEM_PROMPT = """ + You are a simple assistant that can tell the current time and convert Celsius to Fahrenheit. + Use tools when they are helpful. 
+ """ + + messages = [ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": user_prompt}, + ] + + first_response = client.chat.completions.create( + model="gpt-4.1-mini", + messages=messages, + tools=tools, + tool_choice="auto", + ) + + print("First response received from model...") + print(first_response) + + first_message = first_response.choices[0].message + + messages.append( + { + "role": "assistant", + "content": first_message.content, + "tool_calls": first_message.tool_calls, + } + ) + + if first_message.tool_calls: + print("Agentic mode engaged...") + + for tool_call in first_message.tool_calls: + function_name = tool_call.function.name + + if function_name == "get_current_time": + tool_result = get_current_time() + + elif function_name == "celsius_to_fahrenheit": + args = json.loads(tool_call.function.arguments) + tool_result = celsius_to_fahrenheit(args["celsius"]) + + else: + tool_result = f"Error: unknown tool {function_name}." + + print("Tool called:", function_name) + print("Tool result:", tool_result) + + messages.append( + { + "role": "tool", + "tool_call_id": tool_call.id, + "name": function_name, + "content": tool_result, + } + ) + + second_response = client.chat.completions.create( + model="gpt-4.1-mini", + messages=messages, + ) + + print("Second response received from model...") + print(second_response) + + final_message = second_response.choices[0].message + return final_message.content or "" + + print("No tools needed....") + return first_message.content or "" + + +answer_with_agent = run_agent("Convert 100 degrees Celsius to Fahrenheit") +print(answer_with_agent) + +# Was my prediction correct? +# Yes. The model answered directly without calling the tool because +# get_current_time is unrelated to temperature conversion. 
+ + +# ================================================== +# Q3 +# ================================================== + +tools = [ + time_tool_schema, + celsius_to_fahrenheit_schema, +] + +print("Tools list updated with two tools: get_current_time and celsius_to_fahrenheit") + +response_a = run_agent("What is 37 degrees Celsius in Fahrenheit?") +print("Response A:", response_a) + +# A tool was called because the user explicitly requested a temperature conversion, +# and the celsius_to_fahrenheit tool was available to perform that calculation. + +response_b = run_agent("What is the boiling point of water in plain English?") +print("Response B:", response_b) + +# No tool was called because this was a conceptual knowledge question. +# The model already knows the boiling point of water from its training data. + +# ================================================== +# Lesson 03: Multi-Tool Agent +# Q4 +# ================================================== + +from pathlib import Path + +import matplotlib.pyplot as plt +import pandas as pd +from scipy.stats import pearsonr + + +RESOURCES_DIR = Path(__file__).parent / "resources" + + +class CsvManager: + def __init__(self, resources_dir: Path): + self.resources_dir = resources_dir + self.df = None + self.csv_name = None + + def _normalize_csv_name(self, filename: str) -> str: + if not filename.lower().endswith(".csv"): + return filename + ".csv" + return filename + + def _available_csv_files(self) -> list[str]: + if not self.resources_dir.exists(): + return [] + return sorted( + [ + p.name + for p in self.resources_dir.iterdir() + if p.is_file() and p.suffix.lower() == ".csv" + ] + ) + + def _ensure_loaded(self): + if self.df is None: + files = self._available_csv_files() + example = files[0] if files else "your_file.csv" + return { + "error": ( + "No CSV is loaded yet. First load one from resources/. " + f"For example: load_csv '{example}'." 
+ ) + } + return None + + def list_csv_files(self): + """List available CSV files in resources/.""" + files = self._available_csv_files() + if not files: + return { + "message": ( + "No CSV files found in resources/. " + "Create a resources/ folder and put one or more .csv files inside it." + ), + "files": [], + } + return {"files": files} + + def load_csv(self, filename: str): + """Load a CSV file from resources/ and make it the active dataset.""" + filename = self._normalize_csv_name(filename) + path = self.resources_dir / filename + + if not path.exists(): + return { + "error": f"Could not find '{filename}' in resources/.", + "available_files": self._available_csv_files(), + } + + self.df = pd.read_csv(path) + self.csv_name = filename + + return { + "message": f"Loaded {filename} with shape {self.df.shape}.", + "columns": self.df.columns.tolist(), + } + + def get_columns(self): + """Return column names for the currently loaded CSV.""" + error = self._ensure_loaded() + if error: + return error + return self.df.columns.tolist() + + def summarize_columns(self, columns: list[str] | None = None): + """Return basic summary stats for one or more columns.""" + error = self._ensure_loaded() + if error: + return error + + if columns is None: + data = self.df + else: + missing = [c for c in columns if c not in self.df.columns] + if missing: + return {"error": f"These columns are not in the data: {missing}"} + data = self.df[columns] + + summary = data.describe(include="all").transpose().round(3) + return summary.to_dict() + + def describe_column(self, column: str): + """Simple summary for a single column using pandas.describe().""" + error = self._ensure_loaded() + if error: + return error + + if column not in self.df.columns: + return {"error": f"'{column}' is not a column. 
Options: {self.df.columns.tolist()}"} + + s = self.df[column] + summary = s.describe().to_dict() + + cleaned = {} + for key, value in summary.items(): + if isinstance(value, (int, float)): + cleaned[key] = round(value, 3) + else: + cleaned[key] = value + + return cleaned + + def plot_data(self, y: str, x: str | None = None, plot_type: str = "line"): + """Plot from the active CSV.""" + error = self._ensure_loaded() + if error: + return error + + if plot_type not in ["scatter", "line"]: + return "Error: I can only do 'scatter' or 'line'." + + if y not in self.df.columns: + return f"Error: column '{y}' is not in {self.df.columns.tolist()}" + + if x == y: + x = None + + if plot_type == "scatter" and x is None: + return "Error: scatter plots need both x and y columns." + + title_csv = self.csv_name or "current CSV" + + if x is None: + ax = self.df[y].plot(kind="line") + ax.set_title(f"{title_csv} | Line plot: {y} vs row index") + plt.show() + + return f"Plotted {y} vs row index as a line plot." + + if x not in self.df.columns: + return f"Error: column '{x}' is not in {self.df.columns.tolist()}" + + ax = self.df.plot(x=x, y=y, kind=plot_type) + ax.set_title(f"{title_csv} | {plot_type.title()} plot: {y} vs {x}") + plt.savefig("assignments_07/outputs/my_plot.png") + + + return f"Plotted {y} vs {x} as a {plot_type}." + + def compute_correlation(self, col1: str, col2: str): + """ + Compute the Pearson correlation between two columns in the loaded DataFrame. + Returns the correlation coefficient and p-value. 
+ """ + error = self._ensure_loaded() + if error: + return error + + missing = [col for col in [col1, col2] if col not in self.df.columns] + if missing: + return {"error": f"These columns are not in the data: {missing}"} + + pearson_r, p_value = pearsonr(self.df[col1], self.df[col2]) + + return { + "col1": col1, + "col2": col2, + "pearson_r": round(float(pearson_r), 4), + "p_value": round(float(p_value), 4), + } + + +print("Class defined") + +csv_backend = CsvManager(RESOURCES_DIR) + +node_tools = { + "list_csv_files": csv_backend.list_csv_files, + "load_csv": csv_backend.load_csv, + "get_columns": csv_backend.get_columns, + "summarize_columns": csv_backend.summarize_columns, + "describe_column": csv_backend.describe_column, + "plot_data": csv_backend.plot_data, + "compute_correlation": csv_backend.compute_correlation, +} + +tools_schema = [ + { + "type": "function", + "function": { + "name": "list_csv_files", + "description": "List available CSV files in the resources/ folder.", + }, + }, + { + "type": "function", + "function": { + "name": "load_csv", + "description": "Load a CSV file from the resources/ folder and make it the active dataset.", + "parameters": { + "type": "object", + "properties": { + "filename": { + "type": "string", + "description": "CSV filename in resources/, e.g. 'bike_commute.csv'.", + } + }, + "required": ["filename"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "get_columns", + "description": "Get the column names of the currently loaded CSV.", + }, + }, + { + "type": "function", + "function": { + "name": "summarize_columns", + "description": "Show basic summary statistics for columns.", + "parameters": { + "type": "object", + "properties": { + "columns": { + "type": "array", + "items": {"type": "string"}, + "description": "Optional list of column names. 
If omitted, summarize all columns.", + } + }, + }, + }, + }, + { + "type": "function", + "function": { + "name": "describe_column", + "description": "Show basic summary statistics for a single column.", + "parameters": { + "type": "object", + "properties": { + "column": { + "type": "string", + "description": "Column name to describe.", + } + }, + "required": ["column"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "plot_data", + "description": "Plot data from the active CSV. If only y is provided, plot y vs row index.", + "parameters": { + "type": "object", + "properties": { + "y": {"type": "string", "description": "Column name for y-axis."}, + "x": {"type": "string", "description": "Optional column name for x-axis."}, + "plot_type": { + "type": "string", + "enum": ["scatter", "line"], + "description": "Type of plot to create.", + }, + }, + "required": ["y"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "compute_correlation", + "description": "Compute the Pearson correlation coefficient and p-value between two columns.", + "parameters": { + "type": "object", + "properties": { + "col1": { + "type": "string", + "description": "First column name.", + }, + "col2": { + "type": "string", + "description": "Second column name.", + }, + }, + "required": ["col1", "col2"], + }, + }, + }, +] + +# ================================================== +# Q5 +# ================================================== + +def run_agent_cycle(messages, user_text, max_tool_rounds=5): + """ + Run through one ReAct agent loop using tool calling. 
+ """ + messages.append({"role": "user", "content": user_text}) + + def observe_tool_result(tool_call_id, result): + content = json.dumps(result, default=str) if not isinstance(result, str) else result + + return { + "role": "tool", + "tool_call_id": tool_call_id, + "content": content, + } + + for loop_idx in range(max_tool_rounds): + response = client.chat.completions.create( + model="gpt-4.1-mini", + messages=messages, + tools=tools_schema, + ) + + msg = response.choices[0].message + + assistant_entry = { + "role": "assistant", + "content": msg.content, + } + + if msg.tool_calls: + assistant_entry["tool_calls"] = [tc.model_dump() for tc in msg.tool_calls] + + messages.append(assistant_entry) + + if not msg.tool_calls: + return msg.content + + for tool_call in msg.tool_calls: + name = tool_call.function.name + tool_args = json.loads(tool_call.function.arguments or "{}") + + print(f"ACT: {name}({tool_args})") + + fn = node_tools.get(name) + + if fn is None: + result = {"error": f"Tool '{name}' not found."} + else: + try: + result = fn(**tool_args) if tool_args else fn() + except Exception as e: + result = {"error": f"Tool '{name}' failed: {type(e).__name__}: {e}"} + + messages.append(observe_tool_result(tool_call.id, result)) + + return "I hit the tool-round limit. Try a simpler request." + + +SYSTEM_PROMPT = ( + "You are a small data assistant for CSV files stored in resources/. " + "Use the available tools to do any data work (do not guess). " + "If no CSV is loaded yet, load one first (or list available CSV files). " + "Keep answers short and student-friendly." 
+) + + +messages = [{"role": "system", "content": SYSTEM_PROMPT}] + +result = run_agent_cycle( + messages, + "Load bike_commute.csv and compute the correlation between avg_traffic_density and avg_speed_kmh.", +) + +print(result) + +''' +Output: + +ACT: list_csv_files({}) +ACT: load_csv({'filename': 'bike_commute.csv'}) +ACT: compute_correlation({'col1': 'avg_traffic_density', 'col2': 'avg_speed_kmh'}) +The correlation between average traffic density and average speed (km/h) is approximately -0.53. +This indicates a moderate negative correlation, meaning that as traffic density increases, average speed tends to decrease. +The p-value is 0.0, showing this correlation is statistically significant. +''' + +# ================================================== +# Q6 +# ================================================== + +# In the ReAct loop: +# system = instructions that define the agent's behavior and rules. +# user = the user's request or question. +# assistant = the model's reasoning step, either a final answer or a request to call tools. +# tool = the observation step, where Python returns the result of the requested tool call. + +print(json.dumps(messages, indent=2, default=str)) + +# ================================================== +# Lesson 04: smolagents +# Q7 +# ================================================== + +from smolagents import tool + + +@tool +def compute_correlation_tool(col1: str, col2: str) -> dict: + """ + Compute the Pearson correlation between two columns in the currently loaded CSV file. + + Args: + col1: The name of the first numeric column. + col2: The name of the second numeric column. + """ + return csv_backend.compute_correlation(col1, col2) + + +print(compute_correlation_tool.description) + +# Comparison: +# In Q4 we manually wrote a full JSON schema with function name, +# description, parameters, and required fields. +# smolagents generates this automatically from the function signature +# and docstring. 
+
+# To generate a good description, the developer must provide:
+# 1. clear function name
+# 2. meaningful parameter names
+# 3. helpful docstring describing what the tool does
+
+# ==================================================
+# Q8
+# ==================================================
+
+from smolagents import ToolCallingAgent, CodeAgent, OpenAIServerModel
+
+
+model = OpenAIServerModel(
+    model_id="gpt-4.1-mini",
+    api_key=os.getenv("OPENAI_API_KEY"),  # os.getenv returns None when the variable is unset
+)
+
+
+@tool
+def load_csv_tool(filename: str) -> dict:
+    """
+    Load a CSV file from the resources directory.
+
+    Args:
+        filename: The CSV filename to load.
+    """
+    return csv_backend.load_csv(filename)  # delegates to csv_backend
+
+
+@tool
+def plot_data_tool(y: str, x: str | None = None, plot_type: str = "line") -> str:  # x defaults to None, so its annotation must admit None
+    """
+    Plot data from the currently loaded CSV file.
+
+    Args:
+        y: Column name for y-axis.
+        x: Optional column name for x-axis.
+        plot_type: Plot type, either scatter or line.
+    """
+    return csv_backend.plot_data(y, x, plot_type)  # delegates to csv_backend; no color argument is forwarded
+
+
+TOOLS = [
+    load_csv_tool,
+    plot_data_tool,
+    compute_correlation_tool,  # defined in the Q7 section
+]
+
+
+tool_agent = ToolCallingAgent(
+    tools=TOOLS,
+    model=model,
+)
+
+code_agent = CodeAgent(
+    tools=TOOLS,
+    model=model,
+)
+
+prompt = "Load bike_commute.csv. Plot avg_heart_rate vs duration_min as a scatter plot with green dots."
+
+response_tool = tool_agent.run(prompt)  # same prompt is sent to both agents so the results can be compared
+print("ToolCallingAgent response:", response_tool)
+
+response_code = code_agent.run(
+    prompt,
+    additional_args={"csv_manager": csv_backend},  # NOTE(review): presumably exposes the backend to the generated code as csv_manager - confirm against smolagents docs
+)
+print("CodeAgent response:", response_code)
+
+"""
+Comparison:
+
+ToolCallingAgent used the predefined tools:
+- load_csv_tool
+- plot_data_tool
+
+It created a scatter plot, but the dots were the default matplotlib color, not green.
+This happened because plot_data_tool does not have a color parameter, so the agent could not pass "green" into the tool.
+
+CodeAgent attempted to generate and execute Python code dynamically.
+However, it failed because matplotlib tried to create a GUI figure from a worker thread on macOS. +This shows both the strength and the risk of CodeAgent: +it is more flexible because it can write code, but it can also run into execution/environment errors. + +A ToolCallingAgent is better for controlled, predictable workflows. +A CodeAgent is more useful when the task requires custom logic that was not built into the predefined tools. +""" + +# ================================================== +# Q9 +# ================================================== + +""" +A ToolCallingAgent would be a better choice for a task like loading a CSV file, +summarizing columns, or computing a predefined statistic such as correlation. + +This is a good fit for a tool-based approach because the task has clear, +limited actions and safe predefined tools. The agent only needs to choose +which tool to call and what arguments to pass. + +One meaningful risk of using a CodeAgent is that it can generate and execute +unexpected or unsafe code. For example, it may try to import unauthorized +libraries, access attributes that do not exist, modify files, or fail because +of environment-specific issues. A ToolCallingAgent does not have this same +level of risk because it can only call the tools that the developer explicitly +provided. +""" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4c71436..ba83406 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,4 +13,7 @@ llama-index-core==0.14.10 llama-index-embeddings-openai==0.6.0 llama-index-llms-openai==0.7.8 llama-index-readers-file==0.6.0 -pypdf==6.11.0 \ No newline at end of file +pypdf==6.11.0 +hf-xet==1.5.0 +huggingface-hub==1.16.1 +smolagents==1.25.0 \ No newline at end of file