From c0243c13d4ecfafef7a6de73f9b9ec647e979b6e Mon Sep 17 00:00:00 2001 From: JoeXic Date: Fri, 24 Oct 2025 17:47:20 +0100 Subject: [PATCH 01/11] Add guidance --- guide.md | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 guide.md diff --git a/guide.md b/guide.md new file mode 100644 index 0000000..51082bb --- /dev/null +++ b/guide.md @@ -0,0 +1,43 @@ +## Git 管理 +- 开源项目的工作流一般都是 fork + 提PR +- 一个分支本质上是代码历史中指向某个commit的指针 +- feature/monitor branch: 跑benchmark的时候监控中间过程 + +## Reproducing GAIA Validation Benchmark Results + +**Prepara GAIA vaidation dataset:** +```bash +cd data +wget https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/gaia-val.zip +unzip gaia-val.zip +# Unzip passcode: pf4* +``` + +**API key configuration:** +```bash +touch .env +nano .env +``` + +**Run GAIA validation with Claude 3.7 Sonnet** + +```bash +uv run main.py common-benchmark \ + --config_file_name=agent_gaia-validation_claude37sonnet \ + output_dir="logs/gaia-validation-claude37sonnet/$(date +"%Y%m%d_%H%M")" + ``` + +**Checking progress:** +```bash +uv run utils/progress_check/check_gaia_progress.py $PATH_TO_LOG +``` + +**Resume running if interrupted:** +```bash +uv run main.py common-benchmark \ + --config_file_name=agent_gaia-validation_claude37sonnet.yaml \ + output_dir="$PATH_TO_LOG" +``` + + +## Other Benchmark Datasets \ No newline at end of file From 349fb323d35c3f1ee98221af8cfaea150988fa0b Mon Sep 17 00:00:00 2001 From: JoeXic Date: Sat, 1 Nov 2025 16:47:25 +0000 Subject: [PATCH 02/11] feat: Add GAIA benchmark monitoring with web dashboard - Add run-gaia-with-monitor command for running benchmark with real-time monitoring - Add web dashboard for monitoring benchmark progress (gaia_web_monitor.py) - Add generate_gaia_report.py to utils/progress_check/ for generating task reports --- main.py | 3 +- run_gaia_with_monitor.py | 122 +++++ utils/progress_check/gaia_web_monitor.py | 549 +++++++++++++++++++ utils/progress_check/generate_gaia_report.py | 168 ++++++ 4 files changed, 841 insertions(+), 1 deletion(-) create mode 100644 run_gaia_with_monitor.py create mode 100644 utils/progress_check/gaia_web_monitor.py create mode 100644 utils/progress_check/generate_gaia_report.py diff --git a/main.py b/main.py index eaa5d81..c0b232c 100644 --- a/main.py +++ b/main.py @@ -5,6 +5,7 @@ import utils.calculate_average_score import utils.calculate_score_from_log import common_benchmark +import run_gaia_with_monitor import dotenv import utils.eval_answer_from_log import fire @@ -27,7 +28,6 @@ def print_config(*args): cfg = hydra.compose(config_name=config_name(), overrides=list(args)) debug_config(cfg, logger) - if __name__ == "__main__": install(suppress=[fire, hydra], show_locals=True) fire.Fire( @@ -35,6 +35,7 @@ def print_config(*args): "print-config": print_config, "trace": utils.trace_single_task.main, "common-benchmark": common_benchmark.main, + "run-gaia-with-monitor": run_gaia_with_monitor.main, "eval-answer": utils.eval_answer_from_log.main, "avg-score": utils.calculate_average_score.main, "score-from-log": utils.calculate_score_from_log.main, diff --git a/run_gaia_with_monitor.py b/run_gaia_with_monitor.py new file mode 100644 index 0000000..a1e16e9 --- /dev/null +++ b/run_gaia_with_monitor.py @@ -0,0 +1,122 @@ +# SPDX-FileCopyrightText: 2025 MiromindAI +# +# SPDX-License-Identifier: Apache-2.0 + +import os +import subprocess +import signal +import sys +import time +from typing import Optional + + +def main(*args, config_file_name: str = "", 
output_dir: str = "", web_port: int = 8080): + """Run benchmark with integrated web monitoring""" + + # Validate required arguments + if not output_dir: + print("Error: output_dir is required") + print("Usage: uv run main.py run-gaia-with-monitor --output_dir=path --config_file_name=name") + return 1 + + # Create output directory if it doesn't exist + os.makedirs(output_dir, exist_ok=True) + + print("=" * 50) + print("Benchmark Runner with Monitor") + print("=" * 50) + print(f"Output directory: {output_dir}") + print(f"Config name: {config_file_name}") + print(f"Web port: {web_port}") + print("=" * 50) + + # Global variables for process management + benchmark_process: Optional[subprocess.Popen] = None + monitor_process: Optional[subprocess.Popen] = None + + def cleanup_processes(): + """Clean up running processes""" + print("\nShutting down processes...") + + if benchmark_process and benchmark_process.poll() is None: + print(f"Stopping benchmark (PID: {benchmark_process.pid})...") + benchmark_process.terminate() + try: + benchmark_process.wait(timeout=5) + except subprocess.TimeoutExpired: + benchmark_process.kill() + + if monitor_process and monitor_process.poll() is None: + print(f"Stopping monitor (PID: {monitor_process.pid})...") + monitor_process.terminate() + try: + monitor_process.wait(timeout=5) + except subprocess.TimeoutExpired: + monitor_process.kill() + + print("Cleanup complete.") + + def signal_handler(signum, frame): + """Handle Ctrl+C gracefully""" + cleanup_processes() + sys.exit(0) + + # Set up signal handlers + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + + try: + # Start benchmark + print("Starting benchmark...") + benchmark_cmd = [ + "uv", "run", "main.py", "common-benchmark", + f"--config_file_name={config_file_name}", + f"output_dir={output_dir}" + ] + benchmark_process = subprocess.Popen(benchmark_cmd) + print(f"Benchmark started with PID: {benchmark_process.pid}") + + # Wait a moment for benchmark to initialize + time.sleep(3) + + # Start monitor + print("Starting web monitor...") + monitor_cmd = [ + "uv", "run", "utils/progress_check/gaia_web_monitor.py", + output_dir, + f"--web-port={web_port}" + ] + monitor_process = subprocess.Popen(monitor_cmd) + print(f"Monitor started with PID: {monitor_process.pid}") + print(f"Web dashboard available at: http://localhost:{web_port}") + + print("\n" + "=" * 50) + print("Both processes are running!") + print("Press Ctrl+C to stop both processes") + print("Monitor will continue running even if benchmark finishes") + print("=" * 50) + + # Monitor the processes + while True: + time.sleep(5) + + # Check if benchmark process is still running + if benchmark_process and benchmark_process.poll() is not None: + print("Benchmark process ended") + benchmark_process = None + + # Check if monitor process is still running + if monitor_process and monitor_process.poll() is not None: + print("Monitor process died unexpectedly. 
Restarting...") + monitor_process = subprocess.Popen(monitor_cmd) + print(f"Monitor restarted with PID: {monitor_process.pid}") + + except KeyboardInterrupt: + cleanup_processes() + + return 0 + + +if __name__ == "__main__": + import fire + fire.Fire(main) diff --git a/utils/progress_check/gaia_web_monitor.py b/utils/progress_check/gaia_web_monitor.py new file mode 100644 index 0000000..e1efc5a --- /dev/null +++ b/utils/progress_check/gaia_web_monitor.py @@ -0,0 +1,549 @@ +""" +GAIA Benchmark Monitor with Web Interface + +This script provides monitoring capabilities including: +- Real-time web dashboard +- Historical data tracking + +Usage: + uv run utils/progress_check/gaia_web_monitor.py [LOG_FOLDER_PATH] [OPTIONS] + +Options: + --web-port PORT Web interface port (default: 8080) +""" + +import json +import sys +import time +import argparse +from pathlib import Path +from typing import Dict, List, Tuple, Any, Optional +from datetime import datetime, timedelta +import threading +import os +from http.server import HTTPServer, BaseHTTPRequestHandler +import urllib.parse + + +class WebDashboard: + """Simple web dashboard for monitoring""" + + def __init__(self, monitor, port: int = 8080): + self.monitor = monitor + self.port = port + self.server = None + + def start_server(self): + """Start the web server""" + handler = self.create_handler() + self.server = HTTPServer(('localhost', self.port), handler) + print(f"Web dashboard available at: http://localhost:{self.port}") + + def run_server(): + self.server.serve_forever() + + thread = threading.Thread(target=run_server, daemon=True) + thread.start() + + def create_handler(self): + """Create HTTP request handler""" + monitor = self.monitor + + class DashboardHandler(BaseHTTPRequestHandler): + def do_GET(self): + if self.path == '/': + self.send_dashboard() + elif self.path == '/api/status': + self.send_json(monitor.get_status_json()) + elif self.path == '/api/tasks': + self.send_json(monitor.get_tasks_json()) + elif self.path.startswith('/api/task-report/'): + task_id = self.path.split('/')[-1] + self.send_task_report(task_id) + else: + self.send_error(404) + + def send_dashboard(self): + self.send_response(200) + self.send_header('Content-type', 'text/html') + self.end_headers() + + html = self.generate_dashboard_html() + self.wfile.write(html.encode()) + + def send_json(self, data): + self.send_response(200) + self.send_header('Content-type', 'application/json') + self.end_headers() + self.wfile.write(json.dumps(data, default=str).encode()) + + def send_task_report(self, task_id): + """Send task report for a specific task""" + try: + # Try to find the task in the current running tasks + task_info = monitor.get_task_info(task_id) + if not task_info: + self.send_error(404, "Task not found") + return + + # Generate report using the generate_gaia_report script + report_content = monitor.generate_task_report(task_id) + if not report_content: + self.send_error(500, "Failed to generate report") + return + + self.send_response(200) + self.send_header('Content-type', 'text/plain; charset=utf-8') + self.end_headers() + self.wfile.write(report_content.encode('utf-8')) + + except Exception as e: + self.send_error(500, f"Error generating report: {str(e)}") + + def generate_dashboard_html(self): + status = monitor.get_status_json() + tasks = monitor.get_tasks_json() + + return f""" + + + + Benchmark Monitor Dashboard + + + + + + +
+            <h1>Benchmark Monitor Dashboard</h1>
+            <h2>Overall Progress</h2>
+            <div>Progress: 0%</div>
+            <h2>Key Metrics</h2>
+            <div>0 Total Tasks</div>
+            <div>0 Completed</div>
+            <div>0 Running</div>
+            <div>0 Failed</div>
+            <div>0% Accuracy</div>
+            <h2>Recent Tasks</h2>
+            <div>Loading...</div>
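+            <!-- Illustrative sketch only, not the patch's original markup or script: one way the
+                 page could poll the /api/status and /api/tasks endpoints that DashboardHandler
+                 serves. The rendering approach and 30-second interval are assumptions (the
+                 interval mirrors the monitor's scan loop); braces are doubled because this HTML
+                 lives inside a Python f-string. -->
+            <script>
+            async function refresh() {{
+                const status = await (await fetch('/api/status')).json();
+                const tasks = await (await fetch('/api/tasks')).json();
+                document.body.innerHTML =
+                    '<h1>Benchmark Monitor Dashboard</h1>' +
+                    '<p>Progress: ' + status.progress_pct.toFixed(1) + '% (' +
+                    status.completed_tasks + '/' + status.total_tasks + ' completed, ' +
+                    status.running_tasks + ' running, ' + status.failed_tasks + ' failed, ' +
+                    'accuracy ' + status.accuracy.toFixed(1) + '%)</p>' +
+                    '<h2>Recent Tasks</h2><p>' +
+                    tasks.slice(0, 20).map(t => t.task_id + ': ' + t.status).join('<br>') + '</p>';
+            }}
+            refresh();
+            setInterval(refresh, 30000);
+            </script>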
+ + + """ + + return DashboardHandler + + +class AdvancedBenchmarkMonitor: + """GAIA benchmark monitor with web interface""" + + def __init__(self, log_folder: str): + self.log_folder = Path(log_folder) + self.start_time = datetime.now() + # Alerts removed per user request + + # Statistics tracking + self.stats = { + "total_tasks": 0, + "completed_tasks": 0, + "running_tasks": 0, + "failed_tasks": 0, + "correct_answers": 0, + "incorrect_answers": 0, + "execution_times": [], + "error_types": {}, + "task_types": {}, + "last_update": None + } + + self.tasks = {} + self.recent_activity = [] + self._generate_gaia_report_module = None + + def _load_generate_gaia_report_module(self): + """Lazy load the generate_gaia_report module""" + if self._generate_gaia_report_module is None: + import importlib.util + spec = importlib.util.spec_from_file_location( + "generate_gaia_report", + os.path.join(os.path.dirname(__file__), "generate_gaia_report.py") + ) + if spec is None or spec.loader is None: + return None + self._generate_gaia_report_module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(self._generate_gaia_report_module) + return self._generate_gaia_report_module + + def scan_log_files(self) -> List[Path]: + """Scan for all task log files""" + if not self.log_folder.exists(): + return [] + return sorted(self.log_folder.glob("task_*_attempt_*.json"), + key=lambda x: x.stat().st_mtime, reverse=True) + + def parse_task_file(self, file_path: Path) -> Optional[Dict[str, Any]]: + """Parse a single task log file""" + try: + with open(file_path, "r", encoding="utf-8") as f: + return json.load(f) + except (json.JSONDecodeError, FileNotFoundError, KeyError): + return None + + def extract_task_info(self, data: Dict[str, Any], file_path: Path) -> Dict[str, Any]: + """Extract relevant information from task data""" + task_id = data.get("task_id", "unknown") + status = data.get("status", "unknown").lower() + judge_result = data.get("judge_result", "").upper() + final_answer = data.get("final_boxed_answer", "") + error_msg = data.get("error", "") + + # Extract execution time + start_time = data.get("start_time") + end_time = data.get("end_time") + execution_time = None + + if start_time and end_time: + try: + start_dt = datetime.fromisoformat(start_time.replace('Z', '+00:00')) + end_dt = datetime.fromisoformat(end_time.replace('Z', '+00:00')) + execution_time = (end_dt - start_dt).total_seconds() + except: + pass + + # Extract task type from metadata + task_type = "" + metadata = data.get("metadata", {}) + if isinstance(metadata, dict): + # Try to get task type from various metadata fields + if "Level" in metadata: + task_type = f"Level {metadata['Level']}" + elif "task_type" in metadata: + task_type = str(metadata["task_type"]) + elif "type" in metadata: + task_type = str(metadata["type"]) + elif "difficulty" in metadata: + task_type = f"Difficulty {metadata['difficulty']}" + + return { + "task_id": task_id, + "file_path": str(file_path), + "status": status, + "judge_result": judge_result, + "final_answer": final_answer, + "error": error_msg, + "execution_time": execution_time, + "task_type": task_type, + "last_modified": file_path.stat().st_mtime + } + + def update_statistics(self, task_info: Dict[str, Any]): + """Update monitoring statistics and check for alerts""" + task_id = task_info["task_id"] + status = task_info["status"] + judge_result = task_info["judge_result"] + execution_time = task_info["execution_time"] + task_type = task_info["task_type"] + + # Update task tracking + if task_id 
not in self.tasks: + self.tasks[task_id] = task_info + self.stats["total_tasks"] += 1 + # Only count status for new tasks + if status == "completed": + self.stats["completed_tasks"] += 1 + if judge_result == "CORRECT": + self.stats["correct_answers"] += 1 + elif judge_result in ["INCORRECT", "ERROR"]: + self.stats["incorrect_answers"] += 1 + elif status == "running": + self.stats["running_tasks"] += 1 + elif status in ["failed", "error", "interrupted"]: + self.stats["failed_tasks"] += 1 + else: + # Update existing task - only update if status changed + old_status = self.tasks[task_id]["status"] + if old_status != status: + self.recent_activity.append({ + "task_id": task_id, + "old_status": old_status, + "new_status": status, + "timestamp": datetime.now() + }) + + # Decrease old status count + if old_status == "completed": + self.stats["completed_tasks"] -= 1 + old_judge_result = self.tasks[task_id]["judge_result"] + if old_judge_result == "CORRECT": + self.stats["correct_answers"] -= 1 + elif old_judge_result in ["INCORRECT", "ERROR"]: + self.stats["incorrect_answers"] -= 1 + elif old_status == "running": + self.stats["running_tasks"] -= 1 + elif old_status in ["failed", "error", "interrupted"]: + self.stats["failed_tasks"] -= 1 + + # Increase new status count + if status == "completed": + self.stats["completed_tasks"] += 1 + if judge_result == "CORRECT": + self.stats["correct_answers"] += 1 + elif judge_result in ["INCORRECT", "ERROR"]: + self.stats["incorrect_answers"] += 1 + elif status == "running": + self.stats["running_tasks"] += 1 + elif status in ["failed", "error", "interrupted"]: + self.stats["failed_tasks"] += 1 + + self.tasks[task_id] = task_info + + # Track execution times + if execution_time is not None: + self.stats["execution_times"].append(execution_time) + if len(self.stats["execution_times"]) > 100: + self.stats["execution_times"] = self.stats["execution_times"][-100:] + + # Alerts removed; no checks performed + + def get_status_json(self) -> Dict[str, Any]: + """Get current status as JSON for web interface""" + total = self.stats["total_tasks"] + completed = self.stats["completed_tasks"] + running = self.stats["running_tasks"] + failed = self.stats["failed_tasks"] + + progress_pct = (completed / total * 100) if total > 0 else 0 + progress_pct = min(progress_pct, 100.0) # Cap at 100% + + total_judged = self.stats["correct_answers"] + self.stats["incorrect_answers"] + accuracy = (self.stats["correct_answers"] / total_judged * 100) if total_judged > 0 else 0 + + exec_times = self.stats["execution_times"] + avg_execution_time = sum(exec_times) / len(exec_times) if exec_times else 0 + + elapsed_time = (datetime.now() - self.start_time).total_seconds() + tasks_per_second = completed / elapsed_time if elapsed_time > 0 else 0 + + return { + "total_tasks": total, + "completed_tasks": completed, + "running_tasks": running, + "failed_tasks": failed, + "progress_pct": progress_pct, + "accuracy": accuracy, + "avg_execution_time": avg_execution_time, + "tasks_per_second": tasks_per_second, + "last_update": self.stats["last_update"].isoformat() if self.stats["last_update"] else None + } + + def get_tasks_json(self) -> List[Dict[str, Any]]: + """Get tasks list as JSON for web interface""" + return [ + { + "task_id": task_info["task_id"], + "status": task_info["status"], + "judge_result": task_info["judge_result"], + "task_type": task_info["task_type"], + "execution_time": task_info["execution_time"] + } + for task_info in sorted(self.tasks.values(), key=lambda x: x["last_modified"], 
reverse=True) + ] + + def scan_and_update(self): + """Scan log files and update statistics""" + log_files = self.scan_log_files() + + for file_path in log_files: + data = self.parse_task_file(file_path) + if data: + task_info = self.extract_task_info(data, file_path) + self.update_statistics(task_info) + + self.stats["last_update"] = datetime.now() + + def get_task_info(self, task_id: str) -> Optional[Dict[str, Any]]: + """Get information about a specific task""" + return self.tasks.get(task_id) + + def generate_task_report(self, task_id: str) -> Optional[str]: + """Generate the original simple report (no execution details).""" + try: + # Import the original report generator (now in the same directory) + generate_module = self._load_generate_gaia_report_module() + if generate_module is None: + return None + generate_task_report = generate_module.generate_task_report + + # Map task_id to dataset index + task_index = self.find_task_index_in_dataset(task_id) + if task_index is None: + return None + + # Generate and return the plain report content + report_path = generate_task_report(task_index) + if report_path and os.path.exists(report_path): + with open(report_path, 'r', encoding='utf-8') as f: + return f.read() + return None + + except Exception as e: + print(f"Error generating simple report for task {task_id}: {e}") + return None + + + def find_task_index_in_dataset(self, task_id: str) -> Optional[int]: + """Find the index of a task in the GAIA dataset""" + try: + # Import from the same directory + generate_module = self._load_generate_gaia_report_module() + if generate_module is None: + return None + load_gaia_data = generate_module.load_gaia_data + + # Load GAIA data + tasks = load_gaia_data() + + # Find the task by ID + for i, task in enumerate(tasks): + if task.get('task_id') == task_id: + return i + + return None + + except Exception as e: + print(f"Error finding task {task_id} in dataset: {e}") + return None + + +def main(): + parser = argparse.ArgumentParser(description="GAIA Benchmark Monitor") + parser.add_argument("log_folder", nargs="?", default=".", help="Path to log folder") + parser.add_argument("--web-port", type=int, default=8080, help="Web interface port") + # Alert functionality removed; threshold flag no longer supported + + args = parser.parse_args() + + if not Path(args.log_folder).exists(): + print(f"Error: Log folder not found: {args.log_folder}") + return 1 + + # Create monitor + monitor = AdvancedBenchmarkMonitor(args.log_folder) + + # Start web dashboard + dashboard = WebDashboard(monitor, args.web_port) + dashboard.start_server() + + print("GAIA Benchmark Monitor started") + print(f"Web dashboard: http://localhost:{args.web_port}") + print("Press Ctrl+C to stop") + + try: + while True: + monitor.scan_and_update() + time.sleep(30) # Update every 30 seconds + except KeyboardInterrupt: + print("\nMonitor stopped by user") + + return 0 + + +if __name__ == "__main__": + exit(main()) diff --git a/utils/progress_check/generate_gaia_report.py b/utils/progress_check/generate_gaia_report.py new file mode 100644 index 0000000..ce1651e --- /dev/null +++ b/utils/progress_check/generate_gaia_report.py @@ -0,0 +1,168 @@ +#!/usr/bin/env python3 +""" +GAIA Dataset Task Report Generator + +This script generates detailed text reports for specified tasks in the GAIA-val dataset. 
+""" + +import json +import os +import sys +from datetime import datetime + + +def find_gaia_data_dir(): + """Find GAIA data directory automatically""" + # Get the directory where this script is located (utils/progress_check/) + script_dir = os.path.dirname(os.path.abspath(__file__)) + # Project root is two levels up from utils/progress_check/ + repo_root = os.path.abspath(os.path.join(script_dir, "..", "..")) + + # Try common locations + possible_paths = [ + os.path.join(repo_root, "data", "gaia-val"), # Project root/data/gaia-val + os.path.join(script_dir, "..", "data", "gaia-val"), # utils/data/gaia-val (unlikely) + os.path.join(script_dir, "data", "gaia-val"), # utils/progress_check/data/gaia-val (unlikely) + "data/gaia-val", # Relative from current working directory + ] + + for path in possible_paths: + abs_path = os.path.abspath(path) + jsonl_path = os.path.join(abs_path, "standardized_data.jsonl") + if os.path.exists(jsonl_path): + return abs_path + + # If not found, return default path (project root/data/gaia-val) + return os.path.join(repo_root, "data", "gaia-val") + + +def load_gaia_data(data_dir=None): + """Load GAIA validation dataset""" + if data_dir is None: + data_dir = find_gaia_data_dir() + + jsonl_path = os.path.join(data_dir, "standardized_data.jsonl") + + if not os.path.exists(jsonl_path): + print(f"❌ Error: GAIA data file not found at {jsonl_path}") + print("Please ensure the GAIA dataset is available in one of these locations:") + print("- data/gaia-val/standardized_data.jsonl") + print("- ../data/gaia-val/standardized_data.jsonl") + print("- Or specify the correct path using --data-dir argument") + sys.exit(1) + + tasks = [] + with open(jsonl_path, 'r', encoding='utf-8') as f: + for line in f: + if line.strip(): + tasks.append(json.loads(line)) + + return tasks + + +def _default_reports_dir() -> str: + """Return absolute path to the default GAIA reports directory.""" + repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) + reports_dir = os.path.join(repo_root, "gaia_reports") + return reports_dir + + +def generate_task_report(task_index, data_dir=None, output_dir=None): + """Generate detailed text report for specified task""" + print(f"🚀 Loading GAIA dataset...") + tasks = load_gaia_data(data_dir) + + display_index = task_index + 1 + + if task_index >= len(tasks): + print(f"❌ Error: Task index {display_index} out of range, dataset has {len(tasks)} tasks") + return None + + print(f"📄 Generating task {display_index} report...") + + # Get task data + task = tasks[task_index] + + # Set output directory (default to /gaia_reports) + if output_dir is None: + output_dir = _default_reports_dir() + + # Ensure the directory exists + os.makedirs(output_dir, exist_ok=True) + + # Generate report file + report_path = os.path.join(output_dir, f'gaia_task_{display_index}_report.txt') + + with open(report_path, 'w', encoding='utf-8') as f: + f.write("=" * 80 + "\n") + f.write(f"GAIA Dataset Task {display_index} Detailed Report\n") + f.write("=" * 80 + "\n\n") + + # Basic information + f.write("1. Task Basic Information\n") + f.write("-" * 40 + "\n") + f.write(f"Task ID: {task['task_id']}\n") + f.write(f"Difficulty Level: Level {task['metadata']['Level']}\n") + f.write(f"File Attachment: {'Yes' if task.get('file_path') else 'No'}\n") + if task.get('file_path'): + f.write(f"File Path: {task['file_path']}\n") + f.write("\n") + + # Question content + f.write("2. 
Question Content\n") + f.write("-" * 40 + "\n") + f.write(f"{task['task_question']}\n\n") + + # Ground truth answer + f.write("3. Ground Truth Answer\n") + f.write("-" * 40 + "\n") + f.write(f"{task['ground_truth']}\n\n") + + # Solution steps + f.write("4. Detailed Solution Steps\n") + f.write("-" * 40 + "\n") + f.write(f"{task['metadata']['Annotator Metadata']['Steps']}\n\n") + + # Metadata + f.write("5. Task Metadata\n") + f.write("-" * 40 + "\n") + metadata = task['metadata']['Annotator Metadata'] + for key, value in metadata.items(): + if key != 'Steps': # Skip Steps since it's shown in section 4 + if key == 'Tools': + f.write(f"{key}:\n{value}\n\n") + else: + f.write(f"{key}: {value}\n\n") + f.write("\n") + + f.write("=" * 80 + "\n") + f.write("End of Report\n") + f.write("=" * 80 + "\n") + + print(f"📄 Task {display_index} detailed report saved to: {report_path}") + + return report_path + + +def main(): + """Main function""" + import argparse + + parser = argparse.ArgumentParser(description='Generate GAIA dataset task reports') + parser.add_argument('task_index', nargs='?', type=int, default=1, + help='Task index to generate report for (1-based, default: 1)') + parser.add_argument('--data-dir', type=str, default=None, + help='Path to GAIA data directory (auto-detected if not specified)') + parser.add_argument('--output-dir', type=str, default=None, + help='Output directory for reports (default: /gaia_reports)') + + args = parser.parse_args() + + task_index = args.task_index - 1 # Convert to 0-based for internal use + + generate_task_report(task_index, args.data_dir, args.output_dir) + + +if __name__ == "__main__": + main() + From a1775236505e3188434b72aa0268438797ecf9d0 Mon Sep 17 00:00:00 2001 From: JoeXic Date: Sat, 1 Nov 2025 17:01:32 +0000 Subject: [PATCH 03/11] spacing --- main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/main.py b/main.py index c0b232c..b7260e5 100644 --- a/main.py +++ b/main.py @@ -28,6 +28,7 @@ def print_config(*args): cfg = hydra.compose(config_name=config_name(), overrides=list(args)) debug_config(cfg, logger) + if __name__ == "__main__": install(suppress=[fire, hydra], show_locals=True) fire.Fire( From 33c96d24887e77e488b0090fdce50b96f151d12e Mon Sep 17 00:00:00 2001 From: JoeXic Date: Sat, 1 Nov 2025 20:13:10 +0000 Subject: [PATCH 04/11] Remove guide.md from tracking --- guide.md | 43 ------------------------------------------- 1 file changed, 43 deletions(-) delete mode 100644 guide.md diff --git a/guide.md b/guide.md deleted file mode 100644 index 51082bb..0000000 --- a/guide.md +++ /dev/null @@ -1,43 +0,0 @@ -## Git 管理 -- 开源项目的工作流一般都是 fork + 提PR -- 一个分支本质上是代码历史中指向某个commit的指针 -- feature/monitor branch: 跑benchmark的时候监控中间过程 - -## Reproducing GAIA Validation Benchmark Results - -**Prepara GAIA vaidation dataset:** -```bash -cd data -wget https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/gaia-val.zip -unzip gaia-val.zip -# Unzip passcode: pf4* -``` - -**API key configuration:** -```bash -touch .env -nano .env -``` - -**Run GAIA validation with Claude 3.7 Sonnet** - -```bash -uv run main.py common-benchmark \ - --config_file_name=agent_gaia-validation_claude37sonnet \ - output_dir="logs/gaia-validation-claude37sonnet/$(date +"%Y%m%d_%H%M")" - ``` - -**Checking progress:** -```bash -uv run utils/progress_check/check_gaia_progress.py $PATH_TO_LOG -``` - -**Resume running if interrupted:** -```bash -uv run main.py common-benchmark \ - --config_file_name=agent_gaia-validation_claude37sonnet.yaml \ - output_dir="$PATH_TO_LOG" 
-``` - - -## Other Benchmark Datasets \ No newline at end of file From fc18fef3e292ef082196f0fcb66cf8298f0b6405 Mon Sep 17 00:00:00 2001 From: JoeXic Date: Sat, 1 Nov 2025 20:15:24 +0000 Subject: [PATCH 05/11] add guide.md --- guide.md | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 guide.md diff --git a/guide.md b/guide.md new file mode 100644 index 0000000..fc47dde --- /dev/null +++ b/guide.md @@ -0,0 +1,107 @@ +## Git 管理 +- 开源项目的工作流一般都是 fork + 提PR +- 一个分支本质上是代码历史中指向某个commit的指针 +- feature/monitor branch: 跑benchmark的时候监控中间过程 +- upstream + +流程:fork repo -> 创建feature branch -> 提PR -> 通过后merge到自己repo的main + +## 简便设置 +Edit bash files and python files to run monitoring easily + +## Reproducing GAIA Validation Benchmark Results + +**Prepara GAIA vaidation dataset:** +```bash +cd data +wget https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/gaia-val.zip +unzip gaia-val.zip +# Unzip passcode: pf4* +``` + +**API key configuration:** +```bash +touch .env +nano .env +``` + +**Run GAIA validation with Claude 3.7 Sonnet** + +```bash +uv run main.py common-benchmark \ + --config_file_name=agent_gaia-validation_claude37sonnet \ + output_dir="logs/gaia-validation-claude37sonnet/$(date +"%Y%m%d_%H%M")" + ``` + +**Run GAIA validation with integrated web monitoring:** + +```bash +uv run main.py run-gaia-with-monitor \ + --config_file_name=agent_gaia-validation_claude37sonnet \ + --output_dir="logs/gaia-validation-claude37sonnet/$(date +"%Y%m%d_%H%M")" +``` + +This will start both the benchmark and a web dashboard at http://localhost:8080 for real-time monitoring. + +**Alternative: Using the shell script:** + +```bash +./utils/progress_check/run_with_monitor.sh "logs/gaia-validation-claude37sonnet/$(date +"%Y%m%d_%H%M")" +``` + +**Checking progress:** +```bash +uv run utils/progress_check/check_gaia_progress.py $PATH_TO_LOG +``` + +**Start monitoring for existing logs:** +```bash +./utils/progress_check/run_with_monitor.sh --monitor-only $PATH_TO_LOG +``` + +**Resume running if interrupted:** +```bash +uv run main.py common-benchmark \ + --config_file_name=agent_gaia-validation_claude37sonnet \ + output_dir="$PATH_TO_LOG" +``` + +## Visualization (gaia-val) +```bash +uv run utils/progress_check/generate_gaia_report.py +``` + +## Other Benchmark Datasets +Prepare dataset: +```bash +uv run prepare-benchmark get futurex # etc +``` + +Run benchmark +```bash +uv run main.py common-benchmark --config_file_name=agent_finsearchcomp_claude37sonnet benchmark=finsearchcomp output_dir="logs/finsearchcomp/$(date +"%Y%m%d_%H%M")" +``` + +Check progress while running +```bash +uv run utils/progress_check/check_finsearchcomp_progress.py $PATH_TO_LOG +``` + +Resume interrupted evaluation +```bash +uv run main.py common-benchmark --config_file_name=agent_finsearchcomp_claude37sonnet benchmark=finsearchcomp output_dir="$PATH_TO_LOG" +``` + +## Run/resume GAIA-val with web monitor +```bash +uv run main.py run-gaia-with-monitor \ +--config_file_name=agent_gaia-validation_claude37sonnet \ +--output_dir="$PATH_TO_LOG" +``` + +related files: +- `main.py` +- `run_gaia_with_monitor.py` +- `utils/progress_check/generate_gaia_report.py` +- `utils/progress_check/gaia_web_monitor.py` + From 3507659e6998e9e2e7f0a6fc310998089cc01a78 Mon Sep 17 00:00:00 2001 From: JoeXic Date: Sat, 1 Nov 2025 23:41:53 +0000 Subject: [PATCH 06/11] Remove guide.md --- guide.md | 107 ------------------------------------------------------- 1 file changed, 107 
deletions(-) delete mode 100644 guide.md diff --git a/guide.md b/guide.md deleted file mode 100644 index fc47dde..0000000 --- a/guide.md +++ /dev/null @@ -1,107 +0,0 @@ -## Git 管理 -- 开源项目的工作流一般都是 fork + 提PR -- 一个分支本质上是代码历史中指向某个commit的指针 -- feature/monitor branch: 跑benchmark的时候监控中间过程 -- upstream - -流程:fork repo -> 创建feature branch -> 提PR -> 通过后merge到自己repo的main - -## 简便设置 -Edit bash files and python files to run monitoring easily - -## Reproducing GAIA Validation Benchmark Results - -**Prepara GAIA vaidation dataset:** -```bash -cd data -wget https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/gaia-val.zip -unzip gaia-val.zip -# Unzip passcode: pf4* -``` - -**API key configuration:** -```bash -touch .env -nano .env -``` - -**Run GAIA validation with Claude 3.7 Sonnet** - -```bash -uv run main.py common-benchmark \ - --config_file_name=agent_gaia-validation_claude37sonnet \ - output_dir="logs/gaia-validation-claude37sonnet/$(date +"%Y%m%d_%H%M")" - ``` - -**Run GAIA validation with integrated web monitoring:** - -```bash -uv run main.py run-gaia-with-monitor \ - --config_file_name=agent_gaia-validation_claude37sonnet \ - --output_dir="logs/gaia-validation-claude37sonnet/$(date +"%Y%m%d_%H%M")" -``` - -This will start both the benchmark and a web dashboard at http://localhost:8080 for real-time monitoring. - -**Alternative: Using the shell script:** - -```bash -./utils/progress_check/run_with_monitor.sh "logs/gaia-validation-claude37sonnet/$(date +"%Y%m%d_%H%M")" -``` - -**Checking progress:** -```bash -uv run utils/progress_check/check_gaia_progress.py $PATH_TO_LOG -``` - -**Start monitoring for existing logs:** -```bash -./utils/progress_check/run_with_monitor.sh --monitor-only $PATH_TO_LOG -``` - -**Resume running if interrupted:** -```bash -uv run main.py common-benchmark \ - --config_file_name=agent_gaia-validation_claude37sonnet \ - output_dir="$PATH_TO_LOG" -``` - -## Visualization (gaia-val) -```bash -uv run utils/progress_check/generate_gaia_report.py -``` - -## Other Benchmark Datasets -Prepare dataset: -```bash -uv run prepare-benchmark get futurex # etc -``` - -Run benchmark -```bash -uv run main.py common-benchmark --config_file_name=agent_finsearchcomp_claude37sonnet benchmark=finsearchcomp output_dir="logs/finsearchcomp/$(date +"%Y%m%d_%H%M")" -``` - -Check progress while running -```bash -uv run utils/progress_check/check_finsearchcomp_progress.py $PATH_TO_LOG -``` - -Resume interrupted evaluation -```bash -uv run main.py common-benchmark --config_file_name=agent_finsearchcomp_claude37sonnet benchmark=finsearchcomp output_dir="$PATH_TO_LOG" -``` - -## Run/resume GAIA-val with web monitor -```bash -uv run main.py run-gaia-with-monitor \ ---config_file_name=agent_gaia-validation_claude37sonnet \ ---output_dir="$PATH_TO_LOG" -``` - -related files: -- `main.py` -- `run_gaia_with_monitor.py` -- `utils/progress_check/generate_gaia_report.py` -- `utils/progress_check/gaia_web_monitor.py` - From cb853d639d35eecd7deb84365b8bc3cfc2f92cff Mon Sep 17 00:00:00 2001 From: JoeXic Date: Sun, 2 Nov 2025 00:19:23 +0000 Subject: [PATCH 07/11] style: format code with ruff --- run_gaia_with_monitor.py | 56 ++-- utils/progress_check/gaia_web_monitor.py | 271 ++++++++++--------- utils/progress_check/generate_gaia_report.py | 59 ++-- 3 files changed, 210 insertions(+), 176 deletions(-) diff --git a/run_gaia_with_monitor.py b/run_gaia_with_monitor.py index a1e16e9..694fcb4 100644 --- a/run_gaia_with_monitor.py +++ b/run_gaia_with_monitor.py @@ -12,16 +12,18 @@ def main(*args, 
config_file_name: str = "", output_dir: str = "", web_port: int = 8080): """Run benchmark with integrated web monitoring""" - + # Validate required arguments if not output_dir: print("Error: output_dir is required") - print("Usage: uv run main.py run-gaia-with-monitor --output_dir=path --config_file_name=name") + print( + "Usage: uv run main.py run-gaia-with-monitor --output_dir=path --config_file_name=name" + ) return 1 - + # Create output directory if it doesn't exist os.makedirs(output_dir, exist_ok=True) - + print("=" * 50) print("Benchmark Runner with Monitor") print("=" * 50) @@ -29,15 +31,15 @@ def main(*args, config_file_name: str = "", output_dir: str = "", web_port: int print(f"Config name: {config_file_name}") print(f"Web port: {web_port}") print("=" * 50) - + # Global variables for process management benchmark_process: Optional[subprocess.Popen] = None monitor_process: Optional[subprocess.Popen] = None - + def cleanup_processes(): """Clean up running processes""" print("\nShutting down processes...") - + if benchmark_process and benchmark_process.poll() is None: print(f"Stopping benchmark (PID: {benchmark_process.pid})...") benchmark_process.terminate() @@ -45,7 +47,7 @@ def cleanup_processes(): benchmark_process.wait(timeout=5) except subprocess.TimeoutExpired: benchmark_process.kill() - + if monitor_process and monitor_process.poll() is None: print(f"Stopping monitor (PID: {monitor_process.pid})...") monitor_process.terminate() @@ -53,70 +55,76 @@ def cleanup_processes(): monitor_process.wait(timeout=5) except subprocess.TimeoutExpired: monitor_process.kill() - + print("Cleanup complete.") - + def signal_handler(signum, frame): """Handle Ctrl+C gracefully""" cleanup_processes() sys.exit(0) - + # Set up signal handlers signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) - + try: # Start benchmark print("Starting benchmark...") benchmark_cmd = [ - "uv", "run", "main.py", "common-benchmark", + "uv", + "run", + "main.py", + "common-benchmark", f"--config_file_name={config_file_name}", - f"output_dir={output_dir}" + f"output_dir={output_dir}", ] benchmark_process = subprocess.Popen(benchmark_cmd) print(f"Benchmark started with PID: {benchmark_process.pid}") - + # Wait a moment for benchmark to initialize time.sleep(3) - + # Start monitor print("Starting web monitor...") monitor_cmd = [ - "uv", "run", "utils/progress_check/gaia_web_monitor.py", + "uv", + "run", + "utils/progress_check/gaia_web_monitor.py", output_dir, - f"--web-port={web_port}" + f"--web-port={web_port}", ] monitor_process = subprocess.Popen(monitor_cmd) print(f"Monitor started with PID: {monitor_process.pid}") print(f"Web dashboard available at: http://localhost:{web_port}") - + print("\n" + "=" * 50) print("Both processes are running!") print("Press Ctrl+C to stop both processes") print("Monitor will continue running even if benchmark finishes") print("=" * 50) - + # Monitor the processes while True: time.sleep(5) - + # Check if benchmark process is still running if benchmark_process and benchmark_process.poll() is not None: print("Benchmark process ended") benchmark_process = None - + # Check if monitor process is still running if monitor_process and monitor_process.poll() is not None: print("Monitor process died unexpectedly. 
Restarting...") monitor_process = subprocess.Popen(monitor_cmd) print(f"Monitor restarted with PID: {monitor_process.pid}") - + except KeyboardInterrupt: cleanup_processes() - + return 0 if __name__ == "__main__": import fire + fire.Fire(main) diff --git a/utils/progress_check/gaia_web_monitor.py b/utils/progress_check/gaia_web_monitor.py index e1efc5a..6e43c3b 100644 --- a/utils/progress_check/gaia_web_monitor.py +++ b/utils/progress_check/gaia_web_monitor.py @@ -13,70 +13,68 @@ """ import json -import sys import time import argparse from pathlib import Path -from typing import Dict, List, Tuple, Any, Optional -from datetime import datetime, timedelta +from typing import Dict, List, Any, Optional +from datetime import datetime import threading import os from http.server import HTTPServer, BaseHTTPRequestHandler -import urllib.parse class WebDashboard: """Simple web dashboard for monitoring""" - + def __init__(self, monitor, port: int = 8080): self.monitor = monitor self.port = port self.server = None - + def start_server(self): """Start the web server""" handler = self.create_handler() - self.server = HTTPServer(('localhost', self.port), handler) + self.server = HTTPServer(("localhost", self.port), handler) print(f"Web dashboard available at: http://localhost:{self.port}") - + def run_server(): self.server.serve_forever() - + thread = threading.Thread(target=run_server, daemon=True) thread.start() - + def create_handler(self): """Create HTTP request handler""" monitor = self.monitor - + class DashboardHandler(BaseHTTPRequestHandler): def do_GET(self): - if self.path == '/': + if self.path == "/": self.send_dashboard() - elif self.path == '/api/status': + elif self.path == "/api/status": self.send_json(monitor.get_status_json()) - elif self.path == '/api/tasks': + elif self.path == "/api/tasks": self.send_json(monitor.get_tasks_json()) - elif self.path.startswith('/api/task-report/'): - task_id = self.path.split('/')[-1] + elif self.path.startswith("/api/task-report/"): + task_id = self.path.split("/")[-1] self.send_task_report(task_id) else: self.send_error(404) - + def send_dashboard(self): self.send_response(200) - self.send_header('Content-type', 'text/html') + self.send_header("Content-type", "text/html") self.end_headers() - + html = self.generate_dashboard_html() self.wfile.write(html.encode()) - + def send_json(self, data): self.send_response(200) - self.send_header('Content-type', 'application/json') + self.send_header("Content-type", "application/json") self.end_headers() self.wfile.write(json.dumps(data, default=str).encode()) - + def send_task_report(self, task_id): """Send task report for a specific task""" try: @@ -85,26 +83,23 @@ def send_task_report(self, task_id): if not task_info: self.send_error(404, "Task not found") return - + # Generate report using the generate_gaia_report script report_content = monitor.generate_task_report(task_id) if not report_content: self.send_error(500, "Failed to generate report") return - + self.send_response(200) - self.send_header('Content-type', 'text/plain; charset=utf-8') + self.send_header("Content-type", "text/plain; charset=utf-8") self.end_headers() - self.wfile.write(report_content.encode('utf-8')) - + self.wfile.write(report_content.encode("utf-8")) + except Exception as e: self.send_error(500, f"Error generating report: {str(e)}") - + def generate_dashboard_html(self): - status = monitor.get_status_json() - tasks = monitor.get_tasks_json() - - return f""" + return """ @@ -112,26 +107,26 @@ def generate_dashboard_html(self): + + +
+            <h1>{benchmark_name} Monitor Dashboard</h1>
+            <h2>Overall Progress</h2>
+            <div>Progress: 0%</div>
+            <h2>Key Metrics</h2>
+            <div>0 Total Tasks</div>
+            <div>0 Completed</div>
+            <div>0 Running</div>
+            <div>0 Failed</div>
+            <h2>Recent Tasks</h2>
+            <div>Loading...</div>
+ + + """ + + return DashboardHandler + + +class BenchmarkMonitor: + """Generic benchmark monitor with web interface""" + + def __init__(self, log_folder: str): + self.log_folder = Path(log_folder) + self.start_time = datetime.now() + self.benchmark_name = self._detect_benchmark_name() + self.benchmark_type = self._detect_benchmark_type() + + # Initialize statistics based on benchmark type + self.stats = self._initialize_stats() + + self.tasks = {} + self.recent_activity = [] + + def _detect_benchmark_name(self) -> str: + """Detect benchmark name from log folder path or config file""" + # Try to get from .hydra/config.yaml first + hydra_config_path = self.log_folder / ".hydra" / "config.yaml" + if hydra_config_path.exists(): + try: + cfg = OmegaConf.load(hydra_config_path) + benchmark_name = cfg.get("benchmark", {}).get("name", "") + if benchmark_name: + return self._format_benchmark_name(benchmark_name) + except Exception: + pass + + # Try to extract from path (e.g., logs/gaia/... -> GAIA) + path_parts = self.log_folder.parts + if "logs" in path_parts: + idx = path_parts.index("logs") + if idx + 1 < len(path_parts): + benchmark_name = path_parts[idx + 1] + return self._format_benchmark_name(benchmark_name) + + # Default fallback + return "Benchmark" + + def _format_benchmark_name(self, name: str) -> str: + """Format benchmark name to a friendly display format""" + name_lower = name.lower().replace("-", "").replace("_", "") + + # Map common benchmark names to their preferred display format + name_mapping = { + "finsearchcomp": "FinSearchComp", + "futurex": "FutureX", + "future-x": "FutureX", + "gaia": "GAIA", + "xbench": "xbench", + "x-bench": "xbench", + "browsecomp": "BrowseComp", + "browsecomp-zh": "BrowseComp-ZH", + } + + # Check exact match first + if name_lower in name_mapping: + return name_mapping[name_lower] + + # Check partial match (e.g., "finsearchcomp-claude" -> "FinSearchComp") + for key, value in name_mapping.items(): + if name_lower.startswith(key): + return value + + # Default: convert to title case (e.g., "example_dataset" -> "Example Dataset") + return name.replace("-", " ").replace("_", " ").title() + + def _detect_benchmark_type(self) -> str: + """Detect benchmark type to determine statistics logic""" + name_lower = self.benchmark_name.lower() + + if "gaia" in name_lower: + return "gaia" # Has ground truth, needs correctness evaluation + elif "futurex" in name_lower or "future-x" in name_lower: + return "futurex" # No ground truth, prediction-focused + elif "xbench" in name_lower or "x-bench" in name_lower: + return "xbench" # No ground truth, prediction-focused + elif "finsearchcomp" in name_lower or "finsearch-comp" in name_lower: + return "finsearchcomp" # Has ground truth, needs task type breakdown + else: + return "default" # Default: assume has ground truth + + def _initialize_stats(self) -> Dict[str, Any]: + """Initialize statistics based on benchmark type""" + base_stats = { + "total_tasks": 0, + "completed_tasks": 0, + "running_tasks": 0, + "failed_tasks": 0, + "execution_times": [], + "error_types": {}, + "task_types": {}, + "last_update": None, + } + + if self.benchmark_type == "gaia": + # GAIA: correctness evaluation + base_stats.update({ + "correct_answers": 0, + "incorrect_answers": 0, + }) + elif self.benchmark_type in ["futurex", "xbench"]: + # FutureX/xbench: prediction-focused + base_stats.update({ + "with_predictions": 0, + "without_predictions": 0, + "with_errors": 0, + }) + elif self.benchmark_type == "finsearchcomp": + # FinSearchComp: task type 
and regional breakdown (like check_finsearchcomp_progress.py) + base_stats.update({ + "correct_answers": 0, # T2+T3 only + "incorrect_answers": 0, # T2+T3 only + "task_type_breakdown": { + "T1": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}, + "T2": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}, + "T3": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}, + "Unknown": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}, + }, + "regional_breakdown": { + "Global": { + "T2": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}, + "T3": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}, + }, + "Greater China": { + "T2": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}, + "T3": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}, + }, + }, + }) + else: + # Default: assume has ground truth + base_stats.update({ + "correct_answers": 0, + "incorrect_answers": 0, + }) + + return base_stats + + def scan_log_files(self) -> List[Path]: + """Scan for all task log files""" + if not self.log_folder.exists(): + return [] + return sorted( + self.log_folder.glob("task_*_attempt_*.json"), + key=lambda x: x.stat().st_mtime, + reverse=True, + ) + + def parse_task_file(self, file_path: Path) -> Optional[Dict[str, Any]]: + """Parse a single task log file""" + try: + with open(file_path, "r", encoding="utf-8") as f: + return json.load(f) + except (json.JSONDecodeError, FileNotFoundError, KeyError): + return None + + def extract_task_info( + self, data: Dict[str, Any], file_path: Path + ) -> Dict[str, Any]: + """Extract relevant information from task data""" + task_id = data.get("task_id", "unknown") + status = data.get("status", "unknown").lower() + judge_result = data.get("judge_result", "").upper() + final_answer = data.get("final_boxed_answer", "") + error_msg = data.get("error", "") + + # Extract attempt number from filename (e.g., task_xxx_attempt_1.json -> 1) + attempt = 1 # Default + match = re.search(r"_attempt_(\d+)\.json$", str(file_path)) + if match: + attempt = int(match.group(1)) + + # Extract execution time + start_time = data.get("start_time") + end_time = data.get("end_time") + execution_time = None + + if start_time and end_time: + try: + start_dt = datetime.fromisoformat(start_time.replace("Z", "+00:00")) + end_dt = datetime.fromisoformat(end_time.replace("Z", "+00:00")) + execution_time = (end_dt - start_dt).total_seconds() + except Exception: + pass + + # Extract task type from metadata or task_id + task_type = "" + metadata = data.get("metadata", {}) + if isinstance(metadata, dict): + # Try to get task type from various metadata fields + if "Level" in metadata: + task_type = f"Level {metadata['Level']}" + elif "task_type" in metadata: + task_type = str(metadata["task_type"]) + elif "type" in metadata: + task_type = str(metadata["type"]) + elif "difficulty" in metadata: + task_type = f"Difficulty {metadata['difficulty']}" + + # For FinSearchComp, extract task type from task_id (e.g., "(T1)Time_Sensitive_Data_Fetching_006") + if self.benchmark_type == "finsearchcomp" and not task_type: + match = re.match(r"^\(T(\d+)\)", task_id) + if match: + task_type = f"T{match.group(1)}" + + # Extract region for FinSearchComp + region = "" + if self.benchmark_type == "finsearchcomp": + label = data.get("input", {}).get("metadata", {}).get("label", "") + if "(Global)" in label: + region = "Global" + elif "(Greater China)" in label: + region = "Greater China" + + return { + "task_id": task_id, + "file_path": str(file_path), + "status": 
status, + "judge_result": judge_result, + "final_answer": final_answer, + "error": error_msg, + "execution_time": execution_time, + "task_type": task_type, + "region": region, + "attempt": attempt, + "last_modified": file_path.stat().st_mtime, + } + + def update_statistics(self, task_info: Dict[str, Any]): + """Update monitoring statistics based on benchmark type""" + task_id = task_info["task_id"] + status = task_info["status"] + judge_result = task_info["judge_result"] + execution_time = task_info["execution_time"] + final_answer = task_info.get("final_answer", "") + error_msg = task_info.get("error", "") + task_type = task_info.get("task_type", "") + + # Update task tracking + if task_id not in self.tasks: + self.tasks[task_id] = task_info + self.stats["total_tasks"] += 1 + region = task_info.get("region", "") + self._update_stats_for_new_task(status, judge_result, final_answer, error_msg, task_type, region) + else: + # Update existing task - only update if status changed + old_status = self.tasks[task_id]["status"] + if old_status != status: + self.recent_activity.append( + { + "task_id": task_id, + "old_status": old_status, + "new_status": status, + "timestamp": datetime.now(), + } + ) + old_region = self.tasks[task_id].get("region", "") + new_region = task_info.get("region", "") + self._update_stats_for_status_change( + old_status, status, + self.tasks[task_id].get("judge_result", ""), + judge_result, + self.tasks[task_id].get("final_answer", ""), + final_answer, + self.tasks[task_id].get("error", ""), + error_msg, + task_type, + old_region, + new_region + ) + self.tasks[task_id] = task_info + + # Track execution times + if execution_time is not None: + self.stats["execution_times"].append(execution_time) + if len(self.stats["execution_times"]) > 100: + self.stats["execution_times"] = self.stats["execution_times"][-100:] + + def _update_stats_for_new_task(self, status: str, judge_result: str, + final_answer: str, error_msg: str, task_type: str, region: str = ""): + """Update statistics for a new task based on benchmark type (like check_finsearchcomp_progress.py)""" + if status == "completed": + self.stats["completed_tasks"] += 1 + + if self.benchmark_type == "gaia": + if judge_result == "CORRECT": + self.stats["correct_answers"] += 1 + elif judge_result in ["INCORRECT", "ERROR"]: + self.stats["incorrect_answers"] += 1 + elif self.benchmark_type in ["futurex", "xbench"]: + # For xbench/futurex: count predictions for all tasks (like check_xbench_progress.py) + # But prediction_rate is calculated as with_predictions / completed + pass # Predictions and errors are counted below for all statuses + elif self.benchmark_type == "finsearchcomp": + if task_type in ["T1", "T2", "T3", "Unknown"]: + self.stats["task_type_breakdown"][task_type]["completed"] += 1 + + # For T1 tasks, exclude from correctness evaluation (like check_finsearchcomp_progress.py) + # T1 tasks are considered "completed" but not evaluated for correctness due to outdated ground truth + if task_type == "T1": + pass # T1 tasks are excluded from correctness evaluation + elif task_type in ["T2", "T3"]: + # For T2 and T3 tasks, evaluate correctness (like check_finsearchcomp_progress.py) + # If judge_result is CORRECT, count as correct; otherwise (including NOT_ATTEMPTED) count as incorrect + if judge_result == "CORRECT": + self.stats["correct_answers"] += 1 + self.stats["task_type_breakdown"][task_type]["correct"] += 1 + # Update regional breakdown for correct T2 and T3 tasks + if region in ["Global", "Greater China"]: + 
self.stats["regional_breakdown"][region][task_type]["correct"] += 1 + else: + # All non-CORRECT results (including NOT_ATTEMPTED, INCORRECT, ERROR) count as incorrect + self.stats["incorrect_answers"] += 1 + self.stats["task_type_breakdown"][task_type]["incorrect"] += 1 + # Update regional breakdown for incorrect T2 and T3 tasks + if region in ["Global", "Greater China"]: + self.stats["regional_breakdown"][region][task_type]["incorrect"] += 1 + else: # default + if judge_result == "CORRECT": + self.stats["correct_answers"] += 1 + elif judge_result in ["INCORRECT", "ERROR"]: + self.stats["incorrect_answers"] += 1 + elif status == "running": + self.stats["running_tasks"] += 1 + elif status in ["failed", "error", "interrupted"]: + self.stats["failed_tasks"] += 1 + + # For xbench/futurex: count predictions and errors for ALL tasks (like check_xbench_progress.py) + if self.benchmark_type in ["futurex", "xbench"]: + if final_answer and final_answer.strip(): + self.stats["with_predictions"] += 1 + else: + self.stats["without_predictions"] += 1 + if error_msg and error_msg.strip(): + self.stats["with_errors"] += 1 + + # Update task type breakdown for FinSearchComp + if self.benchmark_type == "finsearchcomp" and task_type: + if task_type in ["T1", "T2", "T3", "Unknown"]: + self.stats["task_type_breakdown"][task_type]["total"] += 1 + # Update regional breakdown for T2 and T3 tasks + if task_type in ["T2", "T3"] and region in ["Global", "Greater China"]: + self.stats["regional_breakdown"][region][task_type]["total"] += 1 + if status == "completed": + self.stats["regional_breakdown"][region][task_type]["completed"] += 1 + + def _update_stats_for_status_change(self, old_status: str, new_status: str, + old_judge_result: str, new_judge_result: str, + old_final_answer: str, new_final_answer: str, + old_error: str, new_error: str, + task_type: str, old_region: str = "", new_region: str = ""): + """Update statistics when task status changes""" + # Decrease old status count + if old_status == "completed": + self.stats["completed_tasks"] -= 1 + if self.benchmark_type == "gaia": + if old_judge_result == "CORRECT": + self.stats["correct_answers"] -= 1 + elif old_judge_result in ["INCORRECT", "ERROR"]: + self.stats["incorrect_answers"] -= 1 + elif self.benchmark_type in ["futurex", "xbench"]: + # Predictions and errors are updated below for all statuses + pass + elif self.benchmark_type == "finsearchcomp": + if task_type in ["T1", "T2", "T3", "Unknown"]: + self.stats["task_type_breakdown"][task_type]["completed"] -= 1 + # For T1 tasks, exclude from correctness evaluation (like check_finsearchcomp_progress.py) + if task_type == "T1": + pass # T1 tasks are excluded from correctness evaluation + elif task_type in ["T2", "T3"]: + # Like check_finsearchcomp_progress.py: if CORRECT, count as correct; otherwise as incorrect + if old_judge_result == "CORRECT": + self.stats["correct_answers"] -= 1 + self.stats["task_type_breakdown"][task_type]["correct"] -= 1 + # Update regional breakdown for correct T2 and T3 tasks + if old_region in ["Global", "Greater China"]: + self.stats["regional_breakdown"][old_region][task_type]["correct"] -= 1 + else: + # All non-CORRECT results count as incorrect + self.stats["incorrect_answers"] -= 1 + self.stats["task_type_breakdown"][task_type]["incorrect"] -= 1 + # Update regional breakdown for incorrect T2 and T3 tasks + if old_region in ["Global", "Greater China"]: + self.stats["regional_breakdown"][old_region][task_type]["incorrect"] -= 1 + # Update regional breakdown for completed T2 
and T3 tasks + if old_region in ["Global", "Greater China"]: + self.stats["regional_breakdown"][old_region][task_type]["completed"] -= 1 + else: # default + if old_judge_result == "CORRECT": + self.stats["correct_answers"] -= 1 + elif old_judge_result in ["INCORRECT", "ERROR"]: + self.stats["incorrect_answers"] -= 1 + elif old_status == "running": + self.stats["running_tasks"] -= 1 + elif old_status in ["failed", "error", "interrupted"]: + self.stats["failed_tasks"] -= 1 + + # Increase new status count + if new_status == "completed": + self.stats["completed_tasks"] += 1 + if self.benchmark_type == "gaia": + if new_judge_result == "CORRECT": + self.stats["correct_answers"] += 1 + elif new_judge_result in ["INCORRECT", "ERROR"]: + self.stats["incorrect_answers"] += 1 + elif self.benchmark_type in ["futurex", "xbench"]: + # Predictions and errors are updated below for all statuses + pass + elif self.benchmark_type == "finsearchcomp": + if task_type in ["T1", "T2", "T3", "Unknown"]: + self.stats["task_type_breakdown"][task_type]["completed"] += 1 + + # For T1 tasks, exclude from correctness evaluation (like check_finsearchcomp_progress.py) + # T1 tasks are considered "completed" but not evaluated for correctness due to outdated ground truth + if task_type == "T1": + pass # T1 tasks are excluded from correctness evaluation + elif task_type in ["T2", "T3"]: + # For T2 and T3 tasks, evaluate correctness (like check_finsearchcomp_progress.py) + # If judge_result is CORRECT, count as correct; otherwise (including NOT_ATTEMPTED) count as incorrect + if new_judge_result == "CORRECT": + self.stats["correct_answers"] += 1 + self.stats["task_type_breakdown"][task_type]["correct"] += 1 + # Update regional breakdown for correct T2 and T3 tasks + if new_region in ["Global", "Greater China"]: + self.stats["regional_breakdown"][new_region][task_type]["correct"] += 1 + else: + # All non-CORRECT results (including NOT_ATTEMPTED, INCORRECT, ERROR) count as incorrect + self.stats["incorrect_answers"] += 1 + self.stats["task_type_breakdown"][task_type]["incorrect"] += 1 + # Update regional breakdown for incorrect T2 and T3 tasks + if new_region in ["Global", "Greater China"]: + self.stats["regional_breakdown"][new_region][task_type]["incorrect"] += 1 + # Update regional breakdown for completed T2 and T3 tasks + if new_region in ["Global", "Greater China"]: + self.stats["regional_breakdown"][new_region][task_type]["completed"] += 1 + else: # default + if new_judge_result == "CORRECT": + self.stats["correct_answers"] += 1 + elif new_judge_result in ["INCORRECT", "ERROR"]: + self.stats["incorrect_answers"] += 1 + elif new_status == "running": + self.stats["running_tasks"] += 1 + elif new_status in ["failed", "error", "interrupted"]: + self.stats["failed_tasks"] += 1 + + # For xbench/futurex: update predictions and errors for ALL statuses (like check_xbench_progress.py) + if self.benchmark_type in ["futurex", "xbench"]: + # Decrease old counts + if old_final_answer and old_final_answer.strip(): + self.stats["with_predictions"] -= 1 + else: + self.stats["without_predictions"] -= 1 + if old_error and old_error.strip(): + self.stats["with_errors"] -= 1 + + # Increase new counts + if new_final_answer and new_final_answer.strip(): + self.stats["with_predictions"] += 1 + else: + self.stats["without_predictions"] += 1 + if new_error and new_error.strip(): + self.stats["with_errors"] += 1 + + def get_status_json(self) -> Dict[str, Any]: + """Get current status as JSON for web interface, based on benchmark type""" + total = 
self.stats["total_tasks"] + completed = self.stats["completed_tasks"] + running = self.stats["running_tasks"] + failed = self.stats["failed_tasks"] + + progress_pct = (completed / total * 100) if total > 0 else 0 + progress_pct = min(progress_pct, 100.0) # Cap at 100% + + exec_times = self.stats["execution_times"] + avg_execution_time = sum(exec_times) / len(exec_times) if exec_times else 0 + + elapsed_time = (datetime.now() - self.start_time).total_seconds() + tasks_per_second = completed / elapsed_time if elapsed_time > 0 else 0 + + result = { + "total_tasks": total, + "completed_tasks": completed, + "running_tasks": running, + "failed_tasks": failed, + "progress_pct": progress_pct, + "avg_execution_time": avg_execution_time, + "tasks_per_second": tasks_per_second, + "benchmark_type": self.benchmark_type, + "last_update": self.stats["last_update"].isoformat() + if self.stats["last_update"] + else None, + } + + # Add type-specific metrics + if self.benchmark_type == "gaia": + total_judged = self.stats["correct_answers"] + self.stats["incorrect_answers"] + accuracy = ( + (self.stats["correct_answers"] / total_judged * 100) + if total_judged > 0 + else 0 + ) + result.update({ + "correct_answers": self.stats["correct_answers"], + "incorrect_answers": self.stats["incorrect_answers"], + "accuracy": accuracy, + }) + elif self.benchmark_type in ["futurex", "xbench"]: + prediction_rate = ( + (self.stats["with_predictions"] / completed * 100) + if completed > 0 + else 0 + ) + result.update({ + "with_predictions": self.stats["with_predictions"], + "without_predictions": self.stats["without_predictions"], + "with_errors": self.stats["with_errors"], + "prediction_rate": prediction_rate, + }) + elif self.benchmark_type == "finsearchcomp": + t2_t3_completed = ( + self.stats["task_type_breakdown"]["T2"]["completed"] + + self.stats["task_type_breakdown"]["T3"]["completed"] + ) + t2_t3_correct = ( + self.stats["task_type_breakdown"]["T2"]["correct"] + + self.stats["task_type_breakdown"]["T3"]["correct"] + ) + accuracy = ( + (t2_t3_correct / t2_t3_completed * 100) + if t2_t3_completed > 0 + else 0 + ) + result.update({ + "correct_answers": self.stats["correct_answers"], # T2+T3 only + "incorrect_answers": self.stats["incorrect_answers"], # T2+T3 only + "accuracy": accuracy, # T2+T3 accuracy + "task_type_breakdown": self.stats["task_type_breakdown"], + "regional_breakdown": self.stats["regional_breakdown"], # Like check_finsearchcomp_progress.py + "t1_completed": self.stats["task_type_breakdown"]["T1"]["completed"], + }) + else: # default + total_judged = self.stats["correct_answers"] + self.stats["incorrect_answers"] + accuracy = ( + (self.stats["correct_answers"] / total_judged * 100) + if total_judged > 0 + else 0 + ) + result.update({ + "correct_answers": self.stats["correct_answers"], + "incorrect_answers": self.stats["incorrect_answers"], + "accuracy": accuracy, + }) + + return result + + def get_tasks_json(self) -> List[Dict[str, Any]]: + """Get tasks list as JSON for web interface""" + tasks_list = [] + for task_info in sorted( + self.tasks.values(), key=lambda x: x["last_modified"], reverse=True + ): + # For FutureX/xbench, don't include judge_result (like check_futurex_progress.py, check_xbench_progress.py) + task_dict = { + "task_id": task_info["task_id"], + "status": task_info["status"], + "task_type": task_info["task_type"], + "execution_time": task_info["execution_time"], + } + + # Exclude judge_result for FutureX and xbench (like check_futurex_progress.py, check_xbench_progress.py) + if 
self.benchmark_type not in ["futurex", "xbench"]: + task_dict["judge_result"] = task_info["judge_result"] + else: + # For FutureX/xbench, include final_answer instead (for display purposes) + task_dict["final_answer"] = task_info.get("final_answer", "") + + tasks_list.append(task_dict) + + return tasks_list + + def scan_and_update(self): + """Scan log files and update statistics""" + log_files = self.scan_log_files() + + for file_path in log_files: + data = self.parse_task_file(file_path) + if data: + task_info = self.extract_task_info(data, file_path) + self.update_statistics(task_info) + + self.stats["last_update"] = datetime.now() + + def get_task_info(self, task_id: str) -> Optional[Dict[str, Any]]: + """Get information about a specific task""" + return self.tasks.get(task_id) + + def generate_task_report(self, task_id: str) -> Optional[str]: + """Generate report by calling the standalone report generator""" + try: + # Get task info to extract attempt number + task_info = self.get_task_info(task_id) + if not task_info: + return f"Error: Task {task_id} not found" + + attempt = task_info.get("attempt", 1) + + # Import the report generator module + import importlib.util + report_generator_path = os.path.join( + os.path.dirname(__file__), "generate_benchmark_report.py" + ) + + spec = importlib.util.spec_from_file_location( + "generate_benchmark_report", + report_generator_path, + ) + if spec is None or spec.loader is None: + return f"Error: Could not load report generator module" + + report_module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(report_module) + + # Call the report generator + report_path = report_module.generate_task_report_from_log( + log_dir=str(self.log_folder), + task_id=task_id, + attempt=attempt, + output_dir=None, # Use default output directory + ) + + if report_path and os.path.exists(report_path): + # Read and return the generated report + with open(report_path, "r", encoding="utf-8") as f: + return f.read() + + return f"Error: Failed to generate report for task {task_id}" + + except Exception as e: + return f"Error generating report for task {task_id}: {str(e)}" + + +def main(): + parser = argparse.ArgumentParser(description="Benchmark Monitor with Web Interface") + parser.add_argument("log_folder", nargs="?", default=".", help="Path to benchmark log folder") + parser.add_argument("--web-port", type=int, default=8080, help="Web interface port") + + args = parser.parse_args() + + if not Path(args.log_folder).exists(): + print(f"Error: Log folder not found: {args.log_folder}") + return 1 + + # Create monitor + monitor = BenchmarkMonitor(args.log_folder) + + # Start web dashboard + dashboard = WebDashboard(monitor, args.web_port) + dashboard.start_server() + + print("Benchmark Monitor started") + print(f"Monitoring logs in: {args.log_folder}") + print(f"Web dashboard: http://localhost:{dashboard.port}") + print("Press Ctrl+C to stop") + + try: + while True: + monitor.scan_and_update() + time.sleep(30) # Update every 30 seconds + except KeyboardInterrupt: + print("\nMonitor stopped by user") + + return 0 + + +if __name__ == "__main__": + exit(main()) diff --git a/utils/progress_check/generate_benchmark_report.py b/utils/progress_check/generate_benchmark_report.py new file mode 100644 index 0000000..ffff25c --- /dev/null +++ b/utils/progress_check/generate_benchmark_report.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python3 +""" +Generic Benchmark Task Report Generator + +This script generates detailed text reports for tasks from benchmark log files. 
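+
+Typical invocation (the log directory and task id here are placeholders, not taken from a real run):
+
+    uv run utils/progress_check/generate_benchmark_report.py <log_dir> <task_id> --attempt 1
+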
+Works with any benchmark dataset (GAIA, FinSearchComp, FutureX, etc.) +""" + +import json +import os +import sys +from pathlib import Path +from typing import Optional, Dict, Any + + +def find_task_log_file(log_dir: str, task_id: str, attempt: int = 1) -> Optional[Path]: + """Find task log file in the log directory""" + log_path = Path(log_dir) + if not log_path.exists(): + return None + + # Try to find the log file + pattern = f"task_{task_id}_attempt_{attempt}.json" + log_file = log_path / pattern + + if log_file.exists(): + return log_file + + # Try without attempt number + pattern = f"task_{task_id}.json" + log_file = log_path / pattern + if log_file.exists(): + return log_file + + return None + + +def load_task_from_log(log_file: Path) -> Optional[Dict[str, Any]]: + """Load task data from log file""" + try: + with open(log_file, "r", encoding="utf-8") as f: + return json.load(f) + except (json.JSONDecodeError, FileNotFoundError): + return None + + +def extract_question(log_data: Dict[str, Any]) -> str: + """Extract question from log data in various formats""" + # Try different possible locations + if "task_question" in log_data: + return log_data["task_question"] + + if "input" in log_data: + input_data = log_data["input"] + if isinstance(input_data, dict): + if "task_description" in input_data: + return input_data["task_description"] + elif "task_question" in input_data: + return input_data["task_question"] + elif isinstance(input_data, str): + return input_data + + return "N/A" + + +def extract_metadata_info(log_data: Dict[str, Any]) -> Dict[str, Any]: + """Extract metadata information from log data""" + metadata_info = {} + + # Try to get metadata from various locations + metadata = log_data.get("metadata", {}) + if isinstance(metadata, dict): + metadata_info.update(metadata) + + # Also check input.metadata + if "input" in log_data and isinstance(log_data["input"], dict): + input_metadata = log_data["input"].get("metadata", {}) + if isinstance(input_metadata, dict): + metadata_info.update(input_metadata) + + return metadata_info + + +def generate_task_report_from_log( + log_dir: str, + task_id: str, + attempt: int = 1, + output_dir: Optional[str] = None +) -> Optional[str]: + """Generate detailed text report from task log file""" + + # Find the log file + log_file = find_task_log_file(log_dir, task_id, attempt) + if not log_file: + print(f"❌ Error: Log file not found for task {task_id} (attempt {attempt})") + return None + + # Load task data + log_data = load_task_from_log(log_file) + if not log_data: + print(f"❌ Error: Failed to load log file: {log_file}") + return None + + # Set output directory (default to log_dir/reports) + if output_dir is None: + output_dir = os.path.join(log_dir, "reports") + + # Ensure the directory exists + os.makedirs(output_dir, exist_ok=True) + + # Generate report file + report_filename = f"task_{task_id}_report.txt" + report_path = os.path.join(output_dir, report_filename) + + # Extract information + question = extract_question(log_data) + ground_truth = log_data.get("ground_truth", "N/A") + final_answer = log_data.get("final_boxed_answer", log_data.get("final_answer", "N/A")) + status = log_data.get("status", "unknown") + judge_result = log_data.get("judge_result", "N/A") + error = log_data.get("error", "") + + # Extract execution time + execution_time = None + start_time = log_data.get("start_time") + end_time = log_data.get("end_time") + if start_time and end_time: + try: + from datetime import datetime + start_dt = 
datetime.fromisoformat(start_time.replace("Z", "+00:00")) + end_dt = datetime.fromisoformat(end_time.replace("Z", "+00:00")) + execution_time = (end_dt - start_dt).total_seconds() + except Exception: + pass + + # Extract metadata + metadata_info = extract_metadata_info(log_data) + + # Generate report + with open(report_path, "w", encoding="utf-8") as f: + f.write("=" * 80 + "\n") + f.write(f"Benchmark Task Report: {task_id}\n") + f.write("=" * 80 + "\n\n") + + # Basic information + f.write("1. Task Basic Information\n") + f.write("-" * 40 + "\n") + f.write(f"Task ID: {task_id}\n") + f.write(f"Status: {status}\n") + f.write(f"Judge Result: {judge_result}\n") + if execution_time: + f.write(f"Execution Time: {execution_time:.2f} seconds\n") + if log_data.get("task_file_name"): + f.write(f"File Attachment: {log_data['task_file_name']}\n") + f.write("\n\n") + + # Question content + f.write("2. Question Content\n") + f.write("-" * 40 + "\n") + f.write(f"{question}\n\n\n") + + # Ground truth answer + f.write("3. Ground Truth Answer\n") + f.write("-" * 40 + "\n") + f.write(f"{ground_truth}\n\n\n") + + # Model answer + f.write("4. Model Answer\n") + f.write("-" * 40 + "\n") + f.write(f"{final_answer}\n\n\n") + + # Error information (if any) + if error: + f.write("5. Error Information\n") + f.write("-" * 40 + "\n") + f.write(f"{error}\n\n\n") + + # Metadata (if available) + if metadata_info: + f.write("6. Task Metadata\n") + f.write("-" * 40 + "\n") + for key, value in metadata_info.items(): + if isinstance(value, dict): + f.write(f"{key}:\n") + for sub_key, sub_value in value.items(): + f.write(f" {sub_key}: {sub_value}\n") + elif isinstance(value, list): + f.write(f"{key}: {', '.join(map(str, value))}\n") + else: + f.write(f"{key}: {value}\n") + f.write("\n\n") + + # Execution steps (if available) + if "step_logs" in log_data and log_data["step_logs"]: + f.write("7. 
Execution Steps\n") + f.write("-" * 40 + "\n") + f.write(f"Total steps: {len(log_data['step_logs'])}\n") + # Optionally include step details + f.write("\n") + + f.write("=" * 80 + "\n") + f.write("End of Report\n") + f.write("=" * 80 + "\n") + + print(f"📄 Task {task_id} report saved to: {report_path}") + return report_path + + +def main(): + """Main function""" + import argparse + + parser = argparse.ArgumentParser(description="Generate benchmark task reports from log files") + parser.add_argument( + "log_dir", + type=str, + help="Path to benchmark log directory", + ) + parser.add_argument( + "task_id", + type=str, + help="Task ID to generate report for", + ) + parser.add_argument( + "--attempt", + type=int, + default=1, + help="Attempt number (default: 1)", + ) + parser.add_argument( + "--output-dir", + type=str, + default=None, + help="Output directory for reports (default: /reports)", + ) + + args = parser.parse_args() + + generate_task_report_from_log( + args.log_dir, + args.task_id, + args.attempt, + args.output_dir + ) + + +if __name__ == "__main__": + main() + From 81b32bc4d38db8e7ffa22e5734f577dfe9c7e1dc Mon Sep 17 00:00:00 2001 From: JoeXic Date: Mon, 10 Nov 2025 22:34:23 +0000 Subject: [PATCH 09/11] update main.py --- main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index b7260e5..f0bb018 100644 --- a/main.py +++ b/main.py @@ -5,7 +5,7 @@ import utils.calculate_average_score import utils.calculate_score_from_log import common_benchmark -import run_gaia_with_monitor +import run_benchmark_with_monitor import dotenv import utils.eval_answer_from_log import fire @@ -36,7 +36,7 @@ def print_config(*args): "print-config": print_config, "trace": utils.trace_single_task.main, "common-benchmark": common_benchmark.main, - "run-gaia-with-monitor": run_gaia_with_monitor.main, + "run-benchmark-with-monitor": run_benchmark_with_monitor.main, "eval-answer": utils.eval_answer_from_log.main, "avg-score": utils.calculate_average_score.main, "score-from-log": utils.calculate_score_from_log.main, From 787454cff2b1ba5cce7719958a2b538df9e921c5 Mon Sep 17 00:00:00 2001 From: JoeXic Date: Mon, 10 Nov 2025 22:36:57 +0000 Subject: [PATCH 10/11] delete deprecated files --- run_gaia_with_monitor.py | 130 ----- utils/progress_check/gaia_web_monitor.py | 558 ------------------- utils/progress_check/generate_gaia_report.py | 185 ------ 3 files changed, 873 deletions(-) delete mode 100644 run_gaia_with_monitor.py delete mode 100644 utils/progress_check/gaia_web_monitor.py delete mode 100644 utils/progress_check/generate_gaia_report.py diff --git a/run_gaia_with_monitor.py b/run_gaia_with_monitor.py deleted file mode 100644 index 694fcb4..0000000 --- a/run_gaia_with_monitor.py +++ /dev/null @@ -1,130 +0,0 @@ -# SPDX-FileCopyrightText: 2025 MiromindAI -# -# SPDX-License-Identifier: Apache-2.0 - -import os -import subprocess -import signal -import sys -import time -from typing import Optional - - -def main(*args, config_file_name: str = "", output_dir: str = "", web_port: int = 8080): - """Run benchmark with integrated web monitoring""" - - # Validate required arguments - if not output_dir: - print("Error: output_dir is required") - print( - "Usage: uv run main.py run-gaia-with-monitor --output_dir=path --config_file_name=name" - ) - return 1 - - # Create output directory if it doesn't exist - os.makedirs(output_dir, exist_ok=True) - - print("=" * 50) - print("Benchmark Runner with Monitor") - print("=" * 50) - print(f"Output directory: {output_dir}") - print(f"Config 
name: {config_file_name}") - print(f"Web port: {web_port}") - print("=" * 50) - - # Global variables for process management - benchmark_process: Optional[subprocess.Popen] = None - monitor_process: Optional[subprocess.Popen] = None - - def cleanup_processes(): - """Clean up running processes""" - print("\nShutting down processes...") - - if benchmark_process and benchmark_process.poll() is None: - print(f"Stopping benchmark (PID: {benchmark_process.pid})...") - benchmark_process.terminate() - try: - benchmark_process.wait(timeout=5) - except subprocess.TimeoutExpired: - benchmark_process.kill() - - if monitor_process and monitor_process.poll() is None: - print(f"Stopping monitor (PID: {monitor_process.pid})...") - monitor_process.terminate() - try: - monitor_process.wait(timeout=5) - except subprocess.TimeoutExpired: - monitor_process.kill() - - print("Cleanup complete.") - - def signal_handler(signum, frame): - """Handle Ctrl+C gracefully""" - cleanup_processes() - sys.exit(0) - - # Set up signal handlers - signal.signal(signal.SIGINT, signal_handler) - signal.signal(signal.SIGTERM, signal_handler) - - try: - # Start benchmark - print("Starting benchmark...") - benchmark_cmd = [ - "uv", - "run", - "main.py", - "common-benchmark", - f"--config_file_name={config_file_name}", - f"output_dir={output_dir}", - ] - benchmark_process = subprocess.Popen(benchmark_cmd) - print(f"Benchmark started with PID: {benchmark_process.pid}") - - # Wait a moment for benchmark to initialize - time.sleep(3) - - # Start monitor - print("Starting web monitor...") - monitor_cmd = [ - "uv", - "run", - "utils/progress_check/gaia_web_monitor.py", - output_dir, - f"--web-port={web_port}", - ] - monitor_process = subprocess.Popen(monitor_cmd) - print(f"Monitor started with PID: {monitor_process.pid}") - print(f"Web dashboard available at: http://localhost:{web_port}") - - print("\n" + "=" * 50) - print("Both processes are running!") - print("Press Ctrl+C to stop both processes") - print("Monitor will continue running even if benchmark finishes") - print("=" * 50) - - # Monitor the processes - while True: - time.sleep(5) - - # Check if benchmark process is still running - if benchmark_process and benchmark_process.poll() is not None: - print("Benchmark process ended") - benchmark_process = None - - # Check if monitor process is still running - if monitor_process and monitor_process.poll() is not None: - print("Monitor process died unexpectedly. 
Restarting...") - monitor_process = subprocess.Popen(monitor_cmd) - print(f"Monitor restarted with PID: {monitor_process.pid}") - - except KeyboardInterrupt: - cleanup_processes() - - return 0 - - -if __name__ == "__main__": - import fire - - fire.Fire(main) diff --git a/utils/progress_check/gaia_web_monitor.py b/utils/progress_check/gaia_web_monitor.py deleted file mode 100644 index 6e43c3b..0000000 --- a/utils/progress_check/gaia_web_monitor.py +++ /dev/null @@ -1,558 +0,0 @@ -""" -GAIA Benchmark Monitor with Web Interface - -This script provides monitoring capabilities including: -- Real-time web dashboard -- Historical data tracking - -Usage: - uv run utils/progress_check/gaia_web_monitor.py [LOG_FOLDER_PATH] [OPTIONS] - -Options: - --web-port PORT Web interface port (default: 8080) -""" - -import json -import time -import argparse -from pathlib import Path -from typing import Dict, List, Any, Optional -from datetime import datetime -import threading -import os -from http.server import HTTPServer, BaseHTTPRequestHandler - - -class WebDashboard: - """Simple web dashboard for monitoring""" - - def __init__(self, monitor, port: int = 8080): - self.monitor = monitor - self.port = port - self.server = None - - def start_server(self): - """Start the web server""" - handler = self.create_handler() - self.server = HTTPServer(("localhost", self.port), handler) - print(f"Web dashboard available at: http://localhost:{self.port}") - - def run_server(): - self.server.serve_forever() - - thread = threading.Thread(target=run_server, daemon=True) - thread.start() - - def create_handler(self): - """Create HTTP request handler""" - monitor = self.monitor - - class DashboardHandler(BaseHTTPRequestHandler): - def do_GET(self): - if self.path == "/": - self.send_dashboard() - elif self.path == "/api/status": - self.send_json(monitor.get_status_json()) - elif self.path == "/api/tasks": - self.send_json(monitor.get_tasks_json()) - elif self.path.startswith("/api/task-report/"): - task_id = self.path.split("/")[-1] - self.send_task_report(task_id) - else: - self.send_error(404) - - def send_dashboard(self): - self.send_response(200) - self.send_header("Content-type", "text/html") - self.end_headers() - - html = self.generate_dashboard_html() - self.wfile.write(html.encode()) - - def send_json(self, data): - self.send_response(200) - self.send_header("Content-type", "application/json") - self.end_headers() - self.wfile.write(json.dumps(data, default=str).encode()) - - def send_task_report(self, task_id): - """Send task report for a specific task""" - try: - # Try to find the task in the current running tasks - task_info = monitor.get_task_info(task_id) - if not task_info: - self.send_error(404, "Task not found") - return - - # Generate report using the generate_gaia_report script - report_content = monitor.generate_task_report(task_id) - if not report_content: - self.send_error(500, "Failed to generate report") - return - - self.send_response(200) - self.send_header("Content-type", "text/plain; charset=utf-8") - self.end_headers() - self.wfile.write(report_content.encode("utf-8")) - - except Exception as e: - self.send_error(500, f"Error generating report: {str(e)}") - - def generate_dashboard_html(self): - return """ - - - - Benchmark Monitor Dashboard - - - - - - -
-            [dashboard HTML elided: a page titled "Benchmark Monitor Dashboard" with an "Overall Progress" bar ("Progress: 0%"), a "Key Metrics" panel (Total Tasks, Completed, Running, Failed and Accuracy counters), and a "Recent Tasks" table that shows "Loading..." until data arrives]
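The markup above is populated from the JSON endpoints the handler serves (/api/status, /api/tasks, /api/task-report/<task_id>). A minimal sketch of reading the status feed outside the browser, assuming the monitor is running locally on the default --web-port of 8080, could look like this:

    # Poll the monitor's /api/status endpoint and print a one-line summary.
    # Assumes localhost and the default port 8080; adjust as needed.
    import json
    import urllib.request

    def fetch_status(port: int = 8080) -> dict:
        """Return the payload produced by get_status_json()."""
        url = f"http://localhost:{port}/api/status"
        with urllib.request.urlopen(url, timeout=10) as resp:
            return json.loads(resp.read().decode("utf-8"))

    status = fetch_status()
    print(f"{status['completed_tasks']}/{status['total_tasks']} tasks "
          f"({status['progress_pct']:.1f}%), accuracy {status.get('accuracy', 0):.1f}%")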
- - - """ - - return DashboardHandler - - -class AdvancedBenchmarkMonitor: - """GAIA benchmark monitor with web interface""" - - def __init__(self, log_folder: str): - self.log_folder = Path(log_folder) - self.start_time = datetime.now() - # Alerts removed per user request - - # Statistics tracking - self.stats = { - "total_tasks": 0, - "completed_tasks": 0, - "running_tasks": 0, - "failed_tasks": 0, - "correct_answers": 0, - "incorrect_answers": 0, - "execution_times": [], - "error_types": {}, - "task_types": {}, - "last_update": None, - } - - self.tasks = {} - self.recent_activity = [] - self._generate_gaia_report_module = None - - def _load_generate_gaia_report_module(self): - """Lazy load the generate_gaia_report module""" - if self._generate_gaia_report_module is None: - import importlib.util - - spec = importlib.util.spec_from_file_location( - "generate_gaia_report", - os.path.join(os.path.dirname(__file__), "generate_gaia_report.py"), - ) - if spec is None or spec.loader is None: - return None - self._generate_gaia_report_module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(self._generate_gaia_report_module) - return self._generate_gaia_report_module - - def scan_log_files(self) -> List[Path]: - """Scan for all task log files""" - if not self.log_folder.exists(): - return [] - return sorted( - self.log_folder.glob("task_*_attempt_*.json"), - key=lambda x: x.stat().st_mtime, - reverse=True, - ) - - def parse_task_file(self, file_path: Path) -> Optional[Dict[str, Any]]: - """Parse a single task log file""" - try: - with open(file_path, "r", encoding="utf-8") as f: - return json.load(f) - except (json.JSONDecodeError, FileNotFoundError, KeyError): - return None - - def extract_task_info( - self, data: Dict[str, Any], file_path: Path - ) -> Dict[str, Any]: - """Extract relevant information from task data""" - task_id = data.get("task_id", "unknown") - status = data.get("status", "unknown").lower() - judge_result = data.get("judge_result", "").upper() - final_answer = data.get("final_boxed_answer", "") - error_msg = data.get("error", "") - - # Extract execution time - start_time = data.get("start_time") - end_time = data.get("end_time") - execution_time = None - - if start_time and end_time: - try: - start_dt = datetime.fromisoformat(start_time.replace("Z", "+00:00")) - end_dt = datetime.fromisoformat(end_time.replace("Z", "+00:00")) - execution_time = (end_dt - start_dt).total_seconds() - except Exception: - pass - - # Extract task type from metadata - task_type = "" - metadata = data.get("metadata", {}) - if isinstance(metadata, dict): - # Try to get task type from various metadata fields - if "Level" in metadata: - task_type = f"Level {metadata['Level']}" - elif "task_type" in metadata: - task_type = str(metadata["task_type"]) - elif "type" in metadata: - task_type = str(metadata["type"]) - elif "difficulty" in metadata: - task_type = f"Difficulty {metadata['difficulty']}" - - return { - "task_id": task_id, - "file_path": str(file_path), - "status": status, - "judge_result": judge_result, - "final_answer": final_answer, - "error": error_msg, - "execution_time": execution_time, - "task_type": task_type, - "last_modified": file_path.stat().st_mtime, - } - - def update_statistics(self, task_info: Dict[str, Any]): - """Update monitoring statistics and check for alerts""" - task_id = task_info["task_id"] - status = task_info["status"] - judge_result = task_info["judge_result"] - execution_time = task_info["execution_time"] - - # Update task tracking - if task_id not in 
self.tasks: - self.tasks[task_id] = task_info - self.stats["total_tasks"] += 1 - # Only count status for new tasks - if status == "completed": - self.stats["completed_tasks"] += 1 - if judge_result == "CORRECT": - self.stats["correct_answers"] += 1 - elif judge_result in ["INCORRECT", "ERROR"]: - self.stats["incorrect_answers"] += 1 - elif status == "running": - self.stats["running_tasks"] += 1 - elif status in ["failed", "error", "interrupted"]: - self.stats["failed_tasks"] += 1 - else: - # Update existing task - only update if status changed - old_status = self.tasks[task_id]["status"] - if old_status != status: - self.recent_activity.append( - { - "task_id": task_id, - "old_status": old_status, - "new_status": status, - "timestamp": datetime.now(), - } - ) - - # Decrease old status count - if old_status == "completed": - self.stats["completed_tasks"] -= 1 - old_judge_result = self.tasks[task_id]["judge_result"] - if old_judge_result == "CORRECT": - self.stats["correct_answers"] -= 1 - elif old_judge_result in ["INCORRECT", "ERROR"]: - self.stats["incorrect_answers"] -= 1 - elif old_status == "running": - self.stats["running_tasks"] -= 1 - elif old_status in ["failed", "error", "interrupted"]: - self.stats["failed_tasks"] -= 1 - - # Increase new status count - if status == "completed": - self.stats["completed_tasks"] += 1 - if judge_result == "CORRECT": - self.stats["correct_answers"] += 1 - elif judge_result in ["INCORRECT", "ERROR"]: - self.stats["incorrect_answers"] += 1 - elif status == "running": - self.stats["running_tasks"] += 1 - elif status in ["failed", "error", "interrupted"]: - self.stats["failed_tasks"] += 1 - - self.tasks[task_id] = task_info - - # Track execution times - if execution_time is not None: - self.stats["execution_times"].append(execution_time) - if len(self.stats["execution_times"]) > 100: - self.stats["execution_times"] = self.stats["execution_times"][-100:] - - # Alerts removed; no checks performed - - def get_status_json(self) -> Dict[str, Any]: - """Get current status as JSON for web interface""" - total = self.stats["total_tasks"] - completed = self.stats["completed_tasks"] - running = self.stats["running_tasks"] - failed = self.stats["failed_tasks"] - - progress_pct = (completed / total * 100) if total > 0 else 0 - progress_pct = min(progress_pct, 100.0) # Cap at 100% - - total_judged = self.stats["correct_answers"] + self.stats["incorrect_answers"] - accuracy = ( - (self.stats["correct_answers"] / total_judged * 100) - if total_judged > 0 - else 0 - ) - - exec_times = self.stats["execution_times"] - avg_execution_time = sum(exec_times) / len(exec_times) if exec_times else 0 - - elapsed_time = (datetime.now() - self.start_time).total_seconds() - tasks_per_second = completed / elapsed_time if elapsed_time > 0 else 0 - - return { - "total_tasks": total, - "completed_tasks": completed, - "running_tasks": running, - "failed_tasks": failed, - "progress_pct": progress_pct, - "accuracy": accuracy, - "avg_execution_time": avg_execution_time, - "tasks_per_second": tasks_per_second, - "last_update": self.stats["last_update"].isoformat() - if self.stats["last_update"] - else None, - } - - def get_tasks_json(self) -> List[Dict[str, Any]]: - """Get tasks list as JSON for web interface""" - return [ - { - "task_id": task_info["task_id"], - "status": task_info["status"], - "judge_result": task_info["judge_result"], - "task_type": task_info["task_type"], - "execution_time": task_info["execution_time"], - } - for task_info in sorted( - self.tasks.values(), key=lambda x: 
x["last_modified"], reverse=True - ) - ] - - def scan_and_update(self): - """Scan log files and update statistics""" - log_files = self.scan_log_files() - - for file_path in log_files: - data = self.parse_task_file(file_path) - if data: - task_info = self.extract_task_info(data, file_path) - self.update_statistics(task_info) - - self.stats["last_update"] = datetime.now() - - def get_task_info(self, task_id: str) -> Optional[Dict[str, Any]]: - """Get information about a specific task""" - return self.tasks.get(task_id) - - def generate_task_report(self, task_id: str) -> Optional[str]: - """Generate the original simple report (no execution details).""" - try: - # Import the original report generator (now in the same directory) - generate_module = self._load_generate_gaia_report_module() - if generate_module is None: - return None - generate_task_report = generate_module.generate_task_report - - # Map task_id to dataset index - task_index = self.find_task_index_in_dataset(task_id) - if task_index is None: - return None - - # Generate and return the plain report content - report_path = generate_task_report(task_index) - if report_path and os.path.exists(report_path): - with open(report_path, "r", encoding="utf-8") as f: - return f.read() - return None - - except Exception as e: - print(f"Error generating simple report for task {task_id}: {e}") - return None - - def find_task_index_in_dataset(self, task_id: str) -> Optional[int]: - """Find the index of a task in the GAIA dataset""" - try: - # Import from the same directory - generate_module = self._load_generate_gaia_report_module() - if generate_module is None: - return None - load_gaia_data = generate_module.load_gaia_data - - # Load GAIA data - tasks = load_gaia_data() - - # Find the task by ID - for i, task in enumerate(tasks): - if task.get("task_id") == task_id: - return i - - return None - - except Exception as e: - print(f"Error finding task {task_id} in dataset: {e}") - return None - - -def main(): - parser = argparse.ArgumentParser(description="GAIA Benchmark Monitor") - parser.add_argument("log_folder", nargs="?", default=".", help="Path to log folder") - parser.add_argument("--web-port", type=int, default=8080, help="Web interface port") - # Alert functionality removed; threshold flag no longer supported - - args = parser.parse_args() - - if not Path(args.log_folder).exists(): - print(f"Error: Log folder not found: {args.log_folder}") - return 1 - - # Create monitor - monitor = AdvancedBenchmarkMonitor(args.log_folder) - - # Start web dashboard - dashboard = WebDashboard(monitor, args.web_port) - dashboard.start_server() - - print("GAIA Benchmark Monitor started") - print(f"Web dashboard: http://localhost:{args.web_port}") - print("Press Ctrl+C to stop") - - try: - while True: - monitor.scan_and_update() - time.sleep(30) # Update every 30 seconds - except KeyboardInterrupt: - print("\nMonitor stopped by user") - - return 0 - - -if __name__ == "__main__": - exit(main()) diff --git a/utils/progress_check/generate_gaia_report.py b/utils/progress_check/generate_gaia_report.py deleted file mode 100644 index 5be2d19..0000000 --- a/utils/progress_check/generate_gaia_report.py +++ /dev/null @@ -1,185 +0,0 @@ -#!/usr/bin/env python3 -""" -GAIA Dataset Task Report Generator - -This script generates detailed text reports for specified tasks in the GAIA-val dataset. 
-""" - -import json -import os -import sys - - -def find_gaia_data_dir(): - """Find GAIA data directory automatically""" - # Get the directory where this script is located (utils/progress_check/) - script_dir = os.path.dirname(os.path.abspath(__file__)) - # Project root is two levels up from utils/progress_check/ - repo_root = os.path.abspath(os.path.join(script_dir, "..", "..")) - - # Try common locations - possible_paths = [ - os.path.join(repo_root, "data", "gaia-val"), # Project root/data/gaia-val - os.path.join( - script_dir, "..", "data", "gaia-val" - ), # utils/data/gaia-val (unlikely) - os.path.join( - script_dir, "data", "gaia-val" - ), # utils/progress_check/data/gaia-val (unlikely) - "data/gaia-val", # Relative from current working directory - ] - - for path in possible_paths: - abs_path = os.path.abspath(path) - jsonl_path = os.path.join(abs_path, "standardized_data.jsonl") - if os.path.exists(jsonl_path): - return abs_path - - # If not found, return default path (project root/data/gaia-val) - return os.path.join(repo_root, "data", "gaia-val") - - -def load_gaia_data(data_dir=None): - """Load GAIA validation dataset""" - if data_dir is None: - data_dir = find_gaia_data_dir() - - jsonl_path = os.path.join(data_dir, "standardized_data.jsonl") - - if not os.path.exists(jsonl_path): - print(f"❌ Error: GAIA data file not found at {jsonl_path}") - print("Please ensure the GAIA dataset is available in one of these locations:") - print("- data/gaia-val/standardized_data.jsonl") - print("- ../data/gaia-val/standardized_data.jsonl") - print("- Or specify the correct path using --data-dir argument") - sys.exit(1) - - tasks = [] - with open(jsonl_path, "r", encoding="utf-8") as f: - for line in f: - if line.strip(): - tasks.append(json.loads(line)) - - return tasks - - -def _default_reports_dir() -> str: - """Return absolute path to the default GAIA reports directory.""" - repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) - reports_dir = os.path.join(repo_root, "gaia_reports") - return reports_dir - - -def generate_task_report(task_index, data_dir=None, output_dir=None): - """Generate detailed text report for specified task""" - print("🚀 Loading GAIA dataset...") - tasks = load_gaia_data(data_dir) - - display_index = task_index + 1 - - if task_index >= len(tasks): - print( - f"❌ Error: Task index {display_index} out of range, dataset has {len(tasks)} tasks" - ) - return None - - print(f"📄 Generating task {display_index} report...") - - # Get task data - task = tasks[task_index] - - # Set output directory (default to /gaia_reports) - if output_dir is None: - output_dir = _default_reports_dir() - - # Ensure the directory exists - os.makedirs(output_dir, exist_ok=True) - - # Generate report file - report_path = os.path.join(output_dir, f"gaia_task_{display_index}_report.txt") - - with open(report_path, "w", encoding="utf-8") as f: - f.write("=" * 80 + "\n") - f.write(f"GAIA Dataset Task {display_index} Detailed Report\n") - f.write("=" * 80 + "\n\n") - - # Basic information - f.write("1. Task Basic Information\n") - f.write("-" * 40 + "\n") - f.write(f"Task ID: {task['task_id']}\n") - f.write(f"Difficulty Level: Level {task['metadata']['Level']}\n") - f.write(f"File Attachment: {'Yes' if task.get('file_path') else 'No'}\n") - if task.get("file_path"): - f.write(f"File Path: {task['file_path']}\n") - f.write("\n") - - # Question content - f.write("2. 
Question Content\n") - f.write("-" * 40 + "\n") - f.write(f"{task['task_question']}\n\n") - - # Ground truth answer - f.write("3. Ground Truth Answer\n") - f.write("-" * 40 + "\n") - f.write(f"{task['ground_truth']}\n\n") - - # Solution steps - f.write("4. Detailed Solution Steps\n") - f.write("-" * 40 + "\n") - f.write(f"{task['metadata']['Annotator Metadata']['Steps']}\n\n") - - # Metadata - f.write("5. Task Metadata\n") - f.write("-" * 40 + "\n") - metadata = task["metadata"]["Annotator Metadata"] - for key, value in metadata.items(): - if key != "Steps": # Skip Steps since it's shown in section 4 - if key == "Tools": - f.write(f"{key}:\n{value}\n\n") - else: - f.write(f"{key}: {value}\n\n") - f.write("\n") - - f.write("=" * 80 + "\n") - f.write("End of Report\n") - f.write("=" * 80 + "\n") - - print(f"📄 Task {display_index} detailed report saved to: {report_path}") - - return report_path - - -def main(): - """Main function""" - import argparse - - parser = argparse.ArgumentParser(description="Generate GAIA dataset task reports") - parser.add_argument( - "task_index", - nargs="?", - type=int, - default=1, - help="Task index to generate report for (1-based, default: 1)", - ) - parser.add_argument( - "--data-dir", - type=str, - default=None, - help="Path to GAIA data directory (auto-detected if not specified)", - ) - parser.add_argument( - "--output-dir", - type=str, - default=None, - help="Output directory for reports (default: /gaia_reports)", - ) - - args = parser.parse_args() - - task_index = args.task_index - 1 # Convert to 0-based for internal use - - generate_task_report(task_index, args.data_dir, args.output_dir) - - -if __name__ == "__main__": - main() From b569c1844910318215cf4c8cd372c7e90a8ea185 Mon Sep 17 00:00:00 2001 From: JoeXic Date: Mon, 10 Nov 2025 22:37:41 +0000 Subject: [PATCH 11/11] add import --- utils/progress_check/check_finsearchcomp_progress.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/progress_check/check_finsearchcomp_progress.py b/utils/progress_check/check_finsearchcomp_progress.py index 1104582..c96e9cc 100755 --- a/utils/progress_check/check_finsearchcomp_progress.py +++ b/utils/progress_check/check_finsearchcomp_progress.py @@ -21,7 +21,7 @@ import re import sys from pathlib import Path -from typing import Dict, List, Tuple +from typing import Dict, List, Tuple, Any def extract_task_type(task_id: str) -> str: @@ -61,7 +61,7 @@ def extract_region_from_label(label: str) -> str: return "Unknown" -def analyze_finsearchcomp_results(log_folder: str) -> Dict[str, any]: +def analyze_finsearchcomp_results(log_folder: str) -> Dict[str, Any]: """ Analyze FinSearchComp benchmark results from JSON log files. @@ -192,7 +192,7 @@ def analyze_finsearchcomp_results(log_folder: str) -> Dict[str, any]: def display_results( - results: Dict[str, any], + results: Dict[str, Any], correct_files: List[str], incorrect_files: List[Tuple[str, str]], error_files: List[Tuple[str, str]],