From c0243c13d4ecfafef7a6de73f9b9ec647e979b6e Mon Sep 17 00:00:00 2001 From: JoeXic Date: Fri, 24 Oct 2025 17:47:20 +0100 Subject: [PATCH 01/11] Add guidance --- guide.md | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 guide.md diff --git a/guide.md b/guide.md new file mode 100644 index 0000000..51082bb --- /dev/null +++ b/guide.md @@ -0,0 +1,43 @@ +## Git 管理 +- 开源项目的工作流一般都是 fork + 提PR +- 一个分支本质上是代码历史中指向某个commit的指针 +- feature/monitor branch: 跑benchmark的时候监控中间过程 + +## Reproducing GAIA Validation Benchmark Results + +**Prepara GAIA vaidation dataset:** +```bash +cd data +wget https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/gaia-val.zip +unzip gaia-val.zip +# Unzip passcode: pf4* +``` + +**API key configuration:** +```bash +touch .env +nano .env +``` + +**Run GAIA validation with Claude 3.7 Sonnet** + +```bash +uv run main.py common-benchmark \ + --config_file_name=agent_gaia-validation_claude37sonnet \ + output_dir="logs/gaia-validation-claude37sonnet/$(date +"%Y%m%d_%H%M")" + ``` + +**Checking progress:** +```bash +uv run utils/progress_check/check_gaia_progress.py $PATH_TO_LOG +``` + +**Resume running if interrupted:** +```bash +uv run main.py common-benchmark \ + --config_file_name=agent_gaia-validation_claude37sonnet.yaml \ + output_dir="$PATH_TO_LOG" +``` + + +## Other Benchmark Datasets \ No newline at end of file From 349fb323d35c3f1ee98221af8cfaea150988fa0b Mon Sep 17 00:00:00 2001 From: JoeXic Date: Sat, 1 Nov 2025 16:47:25 +0000 Subject: [PATCH 02/11] feat: Add GAIA benchmark monitoring with web dashboard - Add run-gaia-with-monitor command for running benchmark with real-time monitoring - Add web dashboard for monitoring benchmark progress (gaia_web_monitor.py) - Add generate_gaia_report.py to utils/progress_check/ for generating task reports --- main.py | 3 +- run_gaia_with_monitor.py | 122 +++++ utils/progress_check/gaia_web_monitor.py | 549 +++++++++++++++++++ utils/progress_check/generate_gaia_report.py | 168 ++++++ 4 files changed, 841 insertions(+), 1 deletion(-) create mode 100644 run_gaia_with_monitor.py create mode 100644 utils/progress_check/gaia_web_monitor.py create mode 100644 utils/progress_check/generate_gaia_report.py diff --git a/main.py b/main.py index eaa5d81..c0b232c 100644 --- a/main.py +++ b/main.py @@ -5,6 +5,7 @@ import utils.calculate_average_score import utils.calculate_score_from_log import common_benchmark +import run_gaia_with_monitor import dotenv import utils.eval_answer_from_log import fire @@ -27,7 +28,6 @@ def print_config(*args): cfg = hydra.compose(config_name=config_name(), overrides=list(args)) debug_config(cfg, logger) - if __name__ == "__main__": install(suppress=[fire, hydra], show_locals=True) fire.Fire( @@ -35,6 +35,7 @@ def print_config(*args): "print-config": print_config, "trace": utils.trace_single_task.main, "common-benchmark": common_benchmark.main, + "run-gaia-with-monitor": run_gaia_with_monitor.main, "eval-answer": utils.eval_answer_from_log.main, "avg-score": utils.calculate_average_score.main, "score-from-log": utils.calculate_score_from_log.main, diff --git a/run_gaia_with_monitor.py b/run_gaia_with_monitor.py new file mode 100644 index 0000000..a1e16e9 --- /dev/null +++ b/run_gaia_with_monitor.py @@ -0,0 +1,122 @@ +# SPDX-FileCopyrightText: 2025 MiromindAI +# +# SPDX-License-Identifier: Apache-2.0 + +import os +import subprocess +import signal +import sys +import time +from typing import Optional + + +def main(*args, config_file_name: str = "", 
output_dir: str = "", web_port: int = 8080): + """Run benchmark with integrated web monitoring""" + + # Validate required arguments + if not output_dir: + print("Error: output_dir is required") + print("Usage: uv run main.py run-gaia-with-monitor --output_dir=path --config_file_name=name") + return 1 + + # Create output directory if it doesn't exist + os.makedirs(output_dir, exist_ok=True) + + print("=" * 50) + print("Benchmark Runner with Monitor") + print("=" * 50) + print(f"Output directory: {output_dir}") + print(f"Config name: {config_file_name}") + print(f"Web port: {web_port}") + print("=" * 50) + + # Global variables for process management + benchmark_process: Optional[subprocess.Popen] = None + monitor_process: Optional[subprocess.Popen] = None + + def cleanup_processes(): + """Clean up running processes""" + print("\nShutting down processes...") + + if benchmark_process and benchmark_process.poll() is None: + print(f"Stopping benchmark (PID: {benchmark_process.pid})...") + benchmark_process.terminate() + try: + benchmark_process.wait(timeout=5) + except subprocess.TimeoutExpired: + benchmark_process.kill() + + if monitor_process and monitor_process.poll() is None: + print(f"Stopping monitor (PID: {monitor_process.pid})...") + monitor_process.terminate() + try: + monitor_process.wait(timeout=5) + except subprocess.TimeoutExpired: + monitor_process.kill() + + print("Cleanup complete.") + + def signal_handler(signum, frame): + """Handle Ctrl+C gracefully""" + cleanup_processes() + sys.exit(0) + + # Set up signal handlers + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + + try: + # Start benchmark + print("Starting benchmark...") + benchmark_cmd = [ + "uv", "run", "main.py", "common-benchmark", + f"--config_file_name={config_file_name}", + f"output_dir={output_dir}" + ] + benchmark_process = subprocess.Popen(benchmark_cmd) + print(f"Benchmark started with PID: {benchmark_process.pid}") + + # Wait a moment for benchmark to initialize + time.sleep(3) + + # Start monitor + print("Starting web monitor...") + monitor_cmd = [ + "uv", "run", "utils/progress_check/gaia_web_monitor.py", + output_dir, + f"--web-port={web_port}" + ] + monitor_process = subprocess.Popen(monitor_cmd) + print(f"Monitor started with PID: {monitor_process.pid}") + print(f"Web dashboard available at: http://localhost:{web_port}") + + print("\n" + "=" * 50) + print("Both processes are running!") + print("Press Ctrl+C to stop both processes") + print("Monitor will continue running even if benchmark finishes") + print("=" * 50) + + # Monitor the processes + while True: + time.sleep(5) + + # Check if benchmark process is still running + if benchmark_process and benchmark_process.poll() is not None: + print("Benchmark process ended") + benchmark_process = None + + # Check if monitor process is still running + if monitor_process and monitor_process.poll() is not None: + print("Monitor process died unexpectedly. 
Restarting...") + monitor_process = subprocess.Popen(monitor_cmd) + print(f"Monitor restarted with PID: {monitor_process.pid}") + + except KeyboardInterrupt: + cleanup_processes() + + return 0 + + +if __name__ == "__main__": + import fire + fire.Fire(main) diff --git a/utils/progress_check/gaia_web_monitor.py b/utils/progress_check/gaia_web_monitor.py new file mode 100644 index 0000000..e1efc5a --- /dev/null +++ b/utils/progress_check/gaia_web_monitor.py @@ -0,0 +1,549 @@ +""" +GAIA Benchmark Monitor with Web Interface + +This script provides monitoring capabilities including: +- Real-time web dashboard +- Historical data tracking + +Usage: + uv run utils/progress_check/gaia_web_monitor.py [LOG_FOLDER_PATH] [OPTIONS] + +Options: + --web-port PORT Web interface port (default: 8080) +""" + +import json +import sys +import time +import argparse +from pathlib import Path +from typing import Dict, List, Tuple, Any, Optional +from datetime import datetime, timedelta +import threading +import os +from http.server import HTTPServer, BaseHTTPRequestHandler +import urllib.parse + + +class WebDashboard: + """Simple web dashboard for monitoring""" + + def __init__(self, monitor, port: int = 8080): + self.monitor = monitor + self.port = port + self.server = None + + def start_server(self): + """Start the web server""" + handler = self.create_handler() + self.server = HTTPServer(('localhost', self.port), handler) + print(f"Web dashboard available at: http://localhost:{self.port}") + + def run_server(): + self.server.serve_forever() + + thread = threading.Thread(target=run_server, daemon=True) + thread.start() + + def create_handler(self): + """Create HTTP request handler""" + monitor = self.monitor + + class DashboardHandler(BaseHTTPRequestHandler): + def do_GET(self): + if self.path == '/': + self.send_dashboard() + elif self.path == '/api/status': + self.send_json(monitor.get_status_json()) + elif self.path == '/api/tasks': + self.send_json(monitor.get_tasks_json()) + elif self.path.startswith('/api/task-report/'): + task_id = self.path.split('/')[-1] + self.send_task_report(task_id) + else: + self.send_error(404) + + def send_dashboard(self): + self.send_response(200) + self.send_header('Content-type', 'text/html') + self.end_headers() + + html = self.generate_dashboard_html() + self.wfile.write(html.encode()) + + def send_json(self, data): + self.send_response(200) + self.send_header('Content-type', 'application/json') + self.end_headers() + self.wfile.write(json.dumps(data, default=str).encode()) + + def send_task_report(self, task_id): + """Send task report for a specific task""" + try: + # Try to find the task in the current running tasks + task_info = monitor.get_task_info(task_id) + if not task_info: + self.send_error(404, "Task not found") + return + + # Generate report using the generate_gaia_report script + report_content = monitor.generate_task_report(task_id) + if not report_content: + self.send_error(500, "Failed to generate report") + return + + self.send_response(200) + self.send_header('Content-type', 'text/plain; charset=utf-8') + self.end_headers() + self.wfile.write(report_content.encode('utf-8')) + + except Exception as e: + self.send_error(500, f"Error generating report: {str(e)}") + + def generate_dashboard_html(self): + status = monitor.get_status_json() + tasks = monitor.get_tasks_json() + + return f""" + + + + Benchmark Monitor Dashboard + + + + + + +
+            <h1>Benchmark Monitor Dashboard</h1>
+            <h2>Overall Progress</h2>
+            <div>Progress: 0%</div>
+            <h2>Key Metrics</h2>
+            <div>0 Total Tasks</div>
+            <div>0 Completed</div>
+            <div>0 Running</div>
+            <div>0 Failed</div>
+            <div>0% Accuracy</div>
+            <h2>Recent Tasks</h2>
+            <div>Loading...</div>
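+            <!-- Illustrative sketch only, not the patch's original markup or script: one way the
+                 page could poll the /api/status and /api/tasks endpoints that DashboardHandler
+                 serves. The rendering approach and 30-second interval are assumptions (the
+                 interval mirrors the monitor's scan loop); braces are doubled because this HTML
+                 lives inside a Python f-string. -->
+            <script>
+            async function refresh() {{
+                const status = await (await fetch('/api/status')).json();
+                const tasks = await (await fetch('/api/tasks')).json();
+                document.body.innerHTML =
+                    '<h1>Benchmark Monitor Dashboard</h1>' +
+                    '<p>Progress: ' + status.progress_pct.toFixed(1) + '% (' +
+                    status.completed_tasks + '/' + status.total_tasks + ' completed, ' +
+                    status.running_tasks + ' running, ' + status.failed_tasks + ' failed, ' +
+                    'accuracy ' + status.accuracy.toFixed(1) + '%)</p>' +
+                    '<h2>Recent Tasks</h2><p>' +
+                    tasks.slice(0, 20).map(t => t.task_id + ': ' + t.status).join('<br>') + '</p>';
+            }}
+            refresh();
+            setInterval(refresh, 30000);
+            </script>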
+ + + """ + + return DashboardHandler + + +class AdvancedBenchmarkMonitor: + """GAIA benchmark monitor with web interface""" + + def __init__(self, log_folder: str): + self.log_folder = Path(log_folder) + self.start_time = datetime.now() + # Alerts removed per user request + + # Statistics tracking + self.stats = { + "total_tasks": 0, + "completed_tasks": 0, + "running_tasks": 0, + "failed_tasks": 0, + "correct_answers": 0, + "incorrect_answers": 0, + "execution_times": [], + "error_types": {}, + "task_types": {}, + "last_update": None + } + + self.tasks = {} + self.recent_activity = [] + self._generate_gaia_report_module = None + + def _load_generate_gaia_report_module(self): + """Lazy load the generate_gaia_report module""" + if self._generate_gaia_report_module is None: + import importlib.util + spec = importlib.util.spec_from_file_location( + "generate_gaia_report", + os.path.join(os.path.dirname(__file__), "generate_gaia_report.py") + ) + if spec is None or spec.loader is None: + return None + self._generate_gaia_report_module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(self._generate_gaia_report_module) + return self._generate_gaia_report_module + + def scan_log_files(self) -> List[Path]: + """Scan for all task log files""" + if not self.log_folder.exists(): + return [] + return sorted(self.log_folder.glob("task_*_attempt_*.json"), + key=lambda x: x.stat().st_mtime, reverse=True) + + def parse_task_file(self, file_path: Path) -> Optional[Dict[str, Any]]: + """Parse a single task log file""" + try: + with open(file_path, "r", encoding="utf-8") as f: + return json.load(f) + except (json.JSONDecodeError, FileNotFoundError, KeyError): + return None + + def extract_task_info(self, data: Dict[str, Any], file_path: Path) -> Dict[str, Any]: + """Extract relevant information from task data""" + task_id = data.get("task_id", "unknown") + status = data.get("status", "unknown").lower() + judge_result = data.get("judge_result", "").upper() + final_answer = data.get("final_boxed_answer", "") + error_msg = data.get("error", "") + + # Extract execution time + start_time = data.get("start_time") + end_time = data.get("end_time") + execution_time = None + + if start_time and end_time: + try: + start_dt = datetime.fromisoformat(start_time.replace('Z', '+00:00')) + end_dt = datetime.fromisoformat(end_time.replace('Z', '+00:00')) + execution_time = (end_dt - start_dt).total_seconds() + except: + pass + + # Extract task type from metadata + task_type = "" + metadata = data.get("metadata", {}) + if isinstance(metadata, dict): + # Try to get task type from various metadata fields + if "Level" in metadata: + task_type = f"Level {metadata['Level']}" + elif "task_type" in metadata: + task_type = str(metadata["task_type"]) + elif "type" in metadata: + task_type = str(metadata["type"]) + elif "difficulty" in metadata: + task_type = f"Difficulty {metadata['difficulty']}" + + return { + "task_id": task_id, + "file_path": str(file_path), + "status": status, + "judge_result": judge_result, + "final_answer": final_answer, + "error": error_msg, + "execution_time": execution_time, + "task_type": task_type, + "last_modified": file_path.stat().st_mtime + } + + def update_statistics(self, task_info: Dict[str, Any]): + """Update monitoring statistics and check for alerts""" + task_id = task_info["task_id"] + status = task_info["status"] + judge_result = task_info["judge_result"] + execution_time = task_info["execution_time"] + task_type = task_info["task_type"] + + # Update task tracking + if task_id 
not in self.tasks: + self.tasks[task_id] = task_info + self.stats["total_tasks"] += 1 + # Only count status for new tasks + if status == "completed": + self.stats["completed_tasks"] += 1 + if judge_result == "CORRECT": + self.stats["correct_answers"] += 1 + elif judge_result in ["INCORRECT", "ERROR"]: + self.stats["incorrect_answers"] += 1 + elif status == "running": + self.stats["running_tasks"] += 1 + elif status in ["failed", "error", "interrupted"]: + self.stats["failed_tasks"] += 1 + else: + # Update existing task - only update if status changed + old_status = self.tasks[task_id]["status"] + if old_status != status: + self.recent_activity.append({ + "task_id": task_id, + "old_status": old_status, + "new_status": status, + "timestamp": datetime.now() + }) + + # Decrease old status count + if old_status == "completed": + self.stats["completed_tasks"] -= 1 + old_judge_result = self.tasks[task_id]["judge_result"] + if old_judge_result == "CORRECT": + self.stats["correct_answers"] -= 1 + elif old_judge_result in ["INCORRECT", "ERROR"]: + self.stats["incorrect_answers"] -= 1 + elif old_status == "running": + self.stats["running_tasks"] -= 1 + elif old_status in ["failed", "error", "interrupted"]: + self.stats["failed_tasks"] -= 1 + + # Increase new status count + if status == "completed": + self.stats["completed_tasks"] += 1 + if judge_result == "CORRECT": + self.stats["correct_answers"] += 1 + elif judge_result in ["INCORRECT", "ERROR"]: + self.stats["incorrect_answers"] += 1 + elif status == "running": + self.stats["running_tasks"] += 1 + elif status in ["failed", "error", "interrupted"]: + self.stats["failed_tasks"] += 1 + + self.tasks[task_id] = task_info + + # Track execution times + if execution_time is not None: + self.stats["execution_times"].append(execution_time) + if len(self.stats["execution_times"]) > 100: + self.stats["execution_times"] = self.stats["execution_times"][-100:] + + # Alerts removed; no checks performed + + def get_status_json(self) -> Dict[str, Any]: + """Get current status as JSON for web interface""" + total = self.stats["total_tasks"] + completed = self.stats["completed_tasks"] + running = self.stats["running_tasks"] + failed = self.stats["failed_tasks"] + + progress_pct = (completed / total * 100) if total > 0 else 0 + progress_pct = min(progress_pct, 100.0) # Cap at 100% + + total_judged = self.stats["correct_answers"] + self.stats["incorrect_answers"] + accuracy = (self.stats["correct_answers"] / total_judged * 100) if total_judged > 0 else 0 + + exec_times = self.stats["execution_times"] + avg_execution_time = sum(exec_times) / len(exec_times) if exec_times else 0 + + elapsed_time = (datetime.now() - self.start_time).total_seconds() + tasks_per_second = completed / elapsed_time if elapsed_time > 0 else 0 + + return { + "total_tasks": total, + "completed_tasks": completed, + "running_tasks": running, + "failed_tasks": failed, + "progress_pct": progress_pct, + "accuracy": accuracy, + "avg_execution_time": avg_execution_time, + "tasks_per_second": tasks_per_second, + "last_update": self.stats["last_update"].isoformat() if self.stats["last_update"] else None + } + + def get_tasks_json(self) -> List[Dict[str, Any]]: + """Get tasks list as JSON for web interface""" + return [ + { + "task_id": task_info["task_id"], + "status": task_info["status"], + "judge_result": task_info["judge_result"], + "task_type": task_info["task_type"], + "execution_time": task_info["execution_time"] + } + for task_info in sorted(self.tasks.values(), key=lambda x: x["last_modified"], 
reverse=True) + ] + + def scan_and_update(self): + """Scan log files and update statistics""" + log_files = self.scan_log_files() + + for file_path in log_files: + data = self.parse_task_file(file_path) + if data: + task_info = self.extract_task_info(data, file_path) + self.update_statistics(task_info) + + self.stats["last_update"] = datetime.now() + + def get_task_info(self, task_id: str) -> Optional[Dict[str, Any]]: + """Get information about a specific task""" + return self.tasks.get(task_id) + + def generate_task_report(self, task_id: str) -> Optional[str]: + """Generate the original simple report (no execution details).""" + try: + # Import the original report generator (now in the same directory) + generate_module = self._load_generate_gaia_report_module() + if generate_module is None: + return None + generate_task_report = generate_module.generate_task_report + + # Map task_id to dataset index + task_index = self.find_task_index_in_dataset(task_id) + if task_index is None: + return None + + # Generate and return the plain report content + report_path = generate_task_report(task_index) + if report_path and os.path.exists(report_path): + with open(report_path, 'r', encoding='utf-8') as f: + return f.read() + return None + + except Exception as e: + print(f"Error generating simple report for task {task_id}: {e}") + return None + + + def find_task_index_in_dataset(self, task_id: str) -> Optional[int]: + """Find the index of a task in the GAIA dataset""" + try: + # Import from the same directory + generate_module = self._load_generate_gaia_report_module() + if generate_module is None: + return None + load_gaia_data = generate_module.load_gaia_data + + # Load GAIA data + tasks = load_gaia_data() + + # Find the task by ID + for i, task in enumerate(tasks): + if task.get('task_id') == task_id: + return i + + return None + + except Exception as e: + print(f"Error finding task {task_id} in dataset: {e}") + return None + + +def main(): + parser = argparse.ArgumentParser(description="GAIA Benchmark Monitor") + parser.add_argument("log_folder", nargs="?", default=".", help="Path to log folder") + parser.add_argument("--web-port", type=int, default=8080, help="Web interface port") + # Alert functionality removed; threshold flag no longer supported + + args = parser.parse_args() + + if not Path(args.log_folder).exists(): + print(f"Error: Log folder not found: {args.log_folder}") + return 1 + + # Create monitor + monitor = AdvancedBenchmarkMonitor(args.log_folder) + + # Start web dashboard + dashboard = WebDashboard(monitor, args.web_port) + dashboard.start_server() + + print("GAIA Benchmark Monitor started") + print(f"Web dashboard: http://localhost:{args.web_port}") + print("Press Ctrl+C to stop") + + try: + while True: + monitor.scan_and_update() + time.sleep(30) # Update every 30 seconds + except KeyboardInterrupt: + print("\nMonitor stopped by user") + + return 0 + + +if __name__ == "__main__": + exit(main()) diff --git a/utils/progress_check/generate_gaia_report.py b/utils/progress_check/generate_gaia_report.py new file mode 100644 index 0000000..ce1651e --- /dev/null +++ b/utils/progress_check/generate_gaia_report.py @@ -0,0 +1,168 @@ +#!/usr/bin/env python3 +""" +GAIA Dataset Task Report Generator + +This script generates detailed text reports for specified tasks in the GAIA-val dataset. 
+""" + +import json +import os +import sys +from datetime import datetime + + +def find_gaia_data_dir(): + """Find GAIA data directory automatically""" + # Get the directory where this script is located (utils/progress_check/) + script_dir = os.path.dirname(os.path.abspath(__file__)) + # Project root is two levels up from utils/progress_check/ + repo_root = os.path.abspath(os.path.join(script_dir, "..", "..")) + + # Try common locations + possible_paths = [ + os.path.join(repo_root, "data", "gaia-val"), # Project root/data/gaia-val + os.path.join(script_dir, "..", "data", "gaia-val"), # utils/data/gaia-val (unlikely) + os.path.join(script_dir, "data", "gaia-val"), # utils/progress_check/data/gaia-val (unlikely) + "data/gaia-val", # Relative from current working directory + ] + + for path in possible_paths: + abs_path = os.path.abspath(path) + jsonl_path = os.path.join(abs_path, "standardized_data.jsonl") + if os.path.exists(jsonl_path): + return abs_path + + # If not found, return default path (project root/data/gaia-val) + return os.path.join(repo_root, "data", "gaia-val") + + +def load_gaia_data(data_dir=None): + """Load GAIA validation dataset""" + if data_dir is None: + data_dir = find_gaia_data_dir() + + jsonl_path = os.path.join(data_dir, "standardized_data.jsonl") + + if not os.path.exists(jsonl_path): + print(f"❌ Error: GAIA data file not found at {jsonl_path}") + print("Please ensure the GAIA dataset is available in one of these locations:") + print("- data/gaia-val/standardized_data.jsonl") + print("- ../data/gaia-val/standardized_data.jsonl") + print("- Or specify the correct path using --data-dir argument") + sys.exit(1) + + tasks = [] + with open(jsonl_path, 'r', encoding='utf-8') as f: + for line in f: + if line.strip(): + tasks.append(json.loads(line)) + + return tasks + + +def _default_reports_dir() -> str: + """Return absolute path to the default GAIA reports directory.""" + repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) + reports_dir = os.path.join(repo_root, "gaia_reports") + return reports_dir + + +def generate_task_report(task_index, data_dir=None, output_dir=None): + """Generate detailed text report for specified task""" + print(f"🚀 Loading GAIA dataset...") + tasks = load_gaia_data(data_dir) + + display_index = task_index + 1 + + if task_index >= len(tasks): + print(f"❌ Error: Task index {display_index} out of range, dataset has {len(tasks)} tasks") + return None + + print(f"📄 Generating task {display_index} report...") + + # Get task data + task = tasks[task_index] + + # Set output directory (default to /gaia_reports) + if output_dir is None: + output_dir = _default_reports_dir() + + # Ensure the directory exists + os.makedirs(output_dir, exist_ok=True) + + # Generate report file + report_path = os.path.join(output_dir, f'gaia_task_{display_index}_report.txt') + + with open(report_path, 'w', encoding='utf-8') as f: + f.write("=" * 80 + "\n") + f.write(f"GAIA Dataset Task {display_index} Detailed Report\n") + f.write("=" * 80 + "\n\n") + + # Basic information + f.write("1. Task Basic Information\n") + f.write("-" * 40 + "\n") + f.write(f"Task ID: {task['task_id']}\n") + f.write(f"Difficulty Level: Level {task['metadata']['Level']}\n") + f.write(f"File Attachment: {'Yes' if task.get('file_path') else 'No'}\n") + if task.get('file_path'): + f.write(f"File Path: {task['file_path']}\n") + f.write("\n") + + # Question content + f.write("2. 
Question Content\n") + f.write("-" * 40 + "\n") + f.write(f"{task['task_question']}\n\n") + + # Ground truth answer + f.write("3. Ground Truth Answer\n") + f.write("-" * 40 + "\n") + f.write(f"{task['ground_truth']}\n\n") + + # Solution steps + f.write("4. Detailed Solution Steps\n") + f.write("-" * 40 + "\n") + f.write(f"{task['metadata']['Annotator Metadata']['Steps']}\n\n") + + # Metadata + f.write("5. Task Metadata\n") + f.write("-" * 40 + "\n") + metadata = task['metadata']['Annotator Metadata'] + for key, value in metadata.items(): + if key != 'Steps': # Skip Steps since it's shown in section 4 + if key == 'Tools': + f.write(f"{key}:\n{value}\n\n") + else: + f.write(f"{key}: {value}\n\n") + f.write("\n") + + f.write("=" * 80 + "\n") + f.write("End of Report\n") + f.write("=" * 80 + "\n") + + print(f"📄 Task {display_index} detailed report saved to: {report_path}") + + return report_path + + +def main(): + """Main function""" + import argparse + + parser = argparse.ArgumentParser(description='Generate GAIA dataset task reports') + parser.add_argument('task_index', nargs='?', type=int, default=1, + help='Task index to generate report for (1-based, default: 1)') + parser.add_argument('--data-dir', type=str, default=None, + help='Path to GAIA data directory (auto-detected if not specified)') + parser.add_argument('--output-dir', type=str, default=None, + help='Output directory for reports (default: /gaia_reports)') + + args = parser.parse_args() + + task_index = args.task_index - 1 # Convert to 0-based for internal use + + generate_task_report(task_index, args.data_dir, args.output_dir) + + +if __name__ == "__main__": + main() + From a1775236505e3188434b72aa0268438797ecf9d0 Mon Sep 17 00:00:00 2001 From: JoeXic Date: Sat, 1 Nov 2025 17:01:32 +0000 Subject: [PATCH 03/11] spacing --- main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/main.py b/main.py index c0b232c..b7260e5 100644 --- a/main.py +++ b/main.py @@ -28,6 +28,7 @@ def print_config(*args): cfg = hydra.compose(config_name=config_name(), overrides=list(args)) debug_config(cfg, logger) + if __name__ == "__main__": install(suppress=[fire, hydra], show_locals=True) fire.Fire( From 33c96d24887e77e488b0090fdce50b96f151d12e Mon Sep 17 00:00:00 2001 From: JoeXic Date: Sat, 1 Nov 2025 20:13:10 +0000 Subject: [PATCH 04/11] Remove guide.md from tracking --- guide.md | 43 ------------------------------------------- 1 file changed, 43 deletions(-) delete mode 100644 guide.md diff --git a/guide.md b/guide.md deleted file mode 100644 index 51082bb..0000000 --- a/guide.md +++ /dev/null @@ -1,43 +0,0 @@ -## Git 管理 -- 开源项目的工作流一般都是 fork + 提PR -- 一个分支本质上是代码历史中指向某个commit的指针 -- feature/monitor branch: 跑benchmark的时候监控中间过程 - -## Reproducing GAIA Validation Benchmark Results - -**Prepara GAIA vaidation dataset:** -```bash -cd data -wget https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/gaia-val.zip -unzip gaia-val.zip -# Unzip passcode: pf4* -``` - -**API key configuration:** -```bash -touch .env -nano .env -``` - -**Run GAIA validation with Claude 3.7 Sonnet** - -```bash -uv run main.py common-benchmark \ - --config_file_name=agent_gaia-validation_claude37sonnet \ - output_dir="logs/gaia-validation-claude37sonnet/$(date +"%Y%m%d_%H%M")" - ``` - -**Checking progress:** -```bash -uv run utils/progress_check/check_gaia_progress.py $PATH_TO_LOG -``` - -**Resume running if interrupted:** -```bash -uv run main.py common-benchmark \ - --config_file_name=agent_gaia-validation_claude37sonnet.yaml \ - output_dir="$PATH_TO_LOG" 
-``` - - -## Other Benchmark Datasets \ No newline at end of file From fc18fef3e292ef082196f0fcb66cf8298f0b6405 Mon Sep 17 00:00:00 2001 From: JoeXic Date: Sat, 1 Nov 2025 20:15:24 +0000 Subject: [PATCH 05/11] add guide.md --- guide.md | 107 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 guide.md diff --git a/guide.md b/guide.md new file mode 100644 index 0000000..fc47dde --- /dev/null +++ b/guide.md @@ -0,0 +1,107 @@ +## Git 管理 +- 开源项目的工作流一般都是 fork + 提PR +- 一个分支本质上是代码历史中指向某个commit的指针 +- feature/monitor branch: 跑benchmark的时候监控中间过程 +- upstream + +流程:fork repo -> 创建feature branch -> 提PR -> 通过后merge到自己repo的main + +## 简便设置 +Edit bash files and python files to run monitoring easily + +## Reproducing GAIA Validation Benchmark Results + +**Prepara GAIA vaidation dataset:** +```bash +cd data +wget https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/gaia-val.zip +unzip gaia-val.zip +# Unzip passcode: pf4* +``` + +**API key configuration:** +```bash +touch .env +nano .env +``` + +**Run GAIA validation with Claude 3.7 Sonnet** + +```bash +uv run main.py common-benchmark \ + --config_file_name=agent_gaia-validation_claude37sonnet \ + output_dir="logs/gaia-validation-claude37sonnet/$(date +"%Y%m%d_%H%M")" + ``` + +**Run GAIA validation with integrated web monitoring:** + +```bash +uv run main.py run-gaia-with-monitor \ + --config_file_name=agent_gaia-validation_claude37sonnet \ + --output_dir="logs/gaia-validation-claude37sonnet/$(date +"%Y%m%d_%H%M")" +``` + +This will start both the benchmark and a web dashboard at http://localhost:8080 for real-time monitoring. + +**Alternative: Using the shell script:** + +```bash +./utils/progress_check/run_with_monitor.sh "logs/gaia-validation-claude37sonnet/$(date +"%Y%m%d_%H%M")" +``` + +**Checking progress:** +```bash +uv run utils/progress_check/check_gaia_progress.py $PATH_TO_LOG +``` + +**Start monitoring for existing logs:** +```bash +./utils/progress_check/run_with_monitor.sh --monitor-only $PATH_TO_LOG +``` + +**Resume running if interrupted:** +```bash +uv run main.py common-benchmark \ + --config_file_name=agent_gaia-validation_claude37sonnet \ + output_dir="$PATH_TO_LOG" +``` + +## Visualization (gaia-val) +```bash +uv run utils/progress_check/generate_gaia_report.py +``` + +## Other Benchmark Datasets +Prepare dataset: +```bash +uv run prepare-benchmark get futurex # etc +``` + +Run benchmark +```bash +uv run main.py common-benchmark --config_file_name=agent_finsearchcomp_claude37sonnet benchmark=finsearchcomp output_dir="logs/finsearchcomp/$(date +"%Y%m%d_%H%M")" +``` + +Check progress while running +```bash +uv run utils/progress_check/check_finsearchcomp_progress.py $PATH_TO_LOG +``` + +Resume interrupted evaluation +```bash +uv run main.py common-benchmark --config_file_name=agent_finsearchcomp_claude37sonnet benchmark=finsearchcomp output_dir="$PATH_TO_LOG" +``` + +## Run/resume GAIA-val with web monitor +```bash +uv run main.py run-gaia-with-monitor \ +--config_file_name=agent_gaia-validation_claude37sonnet \ +--output_dir="$PATH_TO_LOG" +``` + +related files: +- `main.py` +- `run_gaia_with_monitor.py` +- `utils/progress_check/generate_gaia_report.py` +- `utils/progress_check/gaia_web_monitor.py` + From 3507659e6998e9e2e7f0a6fc310998089cc01a78 Mon Sep 17 00:00:00 2001 From: JoeXic Date: Sat, 1 Nov 2025 23:41:53 +0000 Subject: [PATCH 06/11] Remove guide.md --- guide.md | 107 ------------------------------------------------------- 1 file changed, 107 
deletions(-) delete mode 100644 guide.md diff --git a/guide.md b/guide.md deleted file mode 100644 index fc47dde..0000000 --- a/guide.md +++ /dev/null @@ -1,107 +0,0 @@ -## Git 管理 -- 开源项目的工作流一般都是 fork + 提PR -- 一个分支本质上是代码历史中指向某个commit的指针 -- feature/monitor branch: 跑benchmark的时候监控中间过程 -- upstream - -流程:fork repo -> 创建feature branch -> 提PR -> 通过后merge到自己repo的main - -## 简便设置 -Edit bash files and python files to run monitoring easily - -## Reproducing GAIA Validation Benchmark Results - -**Prepara GAIA vaidation dataset:** -```bash -cd data -wget https://huggingface.co/datasets/miromind-ai/MiroFlow-Benchmarks/resolve/main/gaia-val.zip -unzip gaia-val.zip -# Unzip passcode: pf4* -``` - -**API key configuration:** -```bash -touch .env -nano .env -``` - -**Run GAIA validation with Claude 3.7 Sonnet** - -```bash -uv run main.py common-benchmark \ - --config_file_name=agent_gaia-validation_claude37sonnet \ - output_dir="logs/gaia-validation-claude37sonnet/$(date +"%Y%m%d_%H%M")" - ``` - -**Run GAIA validation with integrated web monitoring:** - -```bash -uv run main.py run-gaia-with-monitor \ - --config_file_name=agent_gaia-validation_claude37sonnet \ - --output_dir="logs/gaia-validation-claude37sonnet/$(date +"%Y%m%d_%H%M")" -``` - -This will start both the benchmark and a web dashboard at http://localhost:8080 for real-time monitoring. - -**Alternative: Using the shell script:** - -```bash -./utils/progress_check/run_with_monitor.sh "logs/gaia-validation-claude37sonnet/$(date +"%Y%m%d_%H%M")" -``` - -**Checking progress:** -```bash -uv run utils/progress_check/check_gaia_progress.py $PATH_TO_LOG -``` - -**Start monitoring for existing logs:** -```bash -./utils/progress_check/run_with_monitor.sh --monitor-only $PATH_TO_LOG -``` - -**Resume running if interrupted:** -```bash -uv run main.py common-benchmark \ - --config_file_name=agent_gaia-validation_claude37sonnet \ - output_dir="$PATH_TO_LOG" -``` - -## Visualization (gaia-val) -```bash -uv run utils/progress_check/generate_gaia_report.py -``` - -## Other Benchmark Datasets -Prepare dataset: -```bash -uv run prepare-benchmark get futurex # etc -``` - -Run benchmark -```bash -uv run main.py common-benchmark --config_file_name=agent_finsearchcomp_claude37sonnet benchmark=finsearchcomp output_dir="logs/finsearchcomp/$(date +"%Y%m%d_%H%M")" -``` - -Check progress while running -```bash -uv run utils/progress_check/check_finsearchcomp_progress.py $PATH_TO_LOG -``` - -Resume interrupted evaluation -```bash -uv run main.py common-benchmark --config_file_name=agent_finsearchcomp_claude37sonnet benchmark=finsearchcomp output_dir="$PATH_TO_LOG" -``` - -## Run/resume GAIA-val with web monitor -```bash -uv run main.py run-gaia-with-monitor \ ---config_file_name=agent_gaia-validation_claude37sonnet \ ---output_dir="$PATH_TO_LOG" -``` - -related files: -- `main.py` -- `run_gaia_with_monitor.py` -- `utils/progress_check/generate_gaia_report.py` -- `utils/progress_check/gaia_web_monitor.py` - From cb853d639d35eecd7deb84365b8bc3cfc2f92cff Mon Sep 17 00:00:00 2001 From: JoeXic Date: Sun, 2 Nov 2025 00:19:23 +0000 Subject: [PATCH 07/11] style: format code with ruff --- run_gaia_with_monitor.py | 56 ++-- utils/progress_check/gaia_web_monitor.py | 271 ++++++++++--------- utils/progress_check/generate_gaia_report.py | 59 ++-- 3 files changed, 210 insertions(+), 176 deletions(-) diff --git a/run_gaia_with_monitor.py b/run_gaia_with_monitor.py index a1e16e9..694fcb4 100644 --- a/run_gaia_with_monitor.py +++ b/run_gaia_with_monitor.py @@ -12,16 +12,18 @@ def main(*args, 
config_file_name: str = "", output_dir: str = "", web_port: int = 8080): """Run benchmark with integrated web monitoring""" - + # Validate required arguments if not output_dir: print("Error: output_dir is required") - print("Usage: uv run main.py run-gaia-with-monitor --output_dir=path --config_file_name=name") + print( + "Usage: uv run main.py run-gaia-with-monitor --output_dir=path --config_file_name=name" + ) return 1 - + # Create output directory if it doesn't exist os.makedirs(output_dir, exist_ok=True) - + print("=" * 50) print("Benchmark Runner with Monitor") print("=" * 50) @@ -29,15 +31,15 @@ def main(*args, config_file_name: str = "", output_dir: str = "", web_port: int print(f"Config name: {config_file_name}") print(f"Web port: {web_port}") print("=" * 50) - + # Global variables for process management benchmark_process: Optional[subprocess.Popen] = None monitor_process: Optional[subprocess.Popen] = None - + def cleanup_processes(): """Clean up running processes""" print("\nShutting down processes...") - + if benchmark_process and benchmark_process.poll() is None: print(f"Stopping benchmark (PID: {benchmark_process.pid})...") benchmark_process.terminate() @@ -45,7 +47,7 @@ def cleanup_processes(): benchmark_process.wait(timeout=5) except subprocess.TimeoutExpired: benchmark_process.kill() - + if monitor_process and monitor_process.poll() is None: print(f"Stopping monitor (PID: {monitor_process.pid})...") monitor_process.terminate() @@ -53,70 +55,76 @@ def cleanup_processes(): monitor_process.wait(timeout=5) except subprocess.TimeoutExpired: monitor_process.kill() - + print("Cleanup complete.") - + def signal_handler(signum, frame): """Handle Ctrl+C gracefully""" cleanup_processes() sys.exit(0) - + # Set up signal handlers signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) - + try: # Start benchmark print("Starting benchmark...") benchmark_cmd = [ - "uv", "run", "main.py", "common-benchmark", + "uv", + "run", + "main.py", + "common-benchmark", f"--config_file_name={config_file_name}", - f"output_dir={output_dir}" + f"output_dir={output_dir}", ] benchmark_process = subprocess.Popen(benchmark_cmd) print(f"Benchmark started with PID: {benchmark_process.pid}") - + # Wait a moment for benchmark to initialize time.sleep(3) - + # Start monitor print("Starting web monitor...") monitor_cmd = [ - "uv", "run", "utils/progress_check/gaia_web_monitor.py", + "uv", + "run", + "utils/progress_check/gaia_web_monitor.py", output_dir, - f"--web-port={web_port}" + f"--web-port={web_port}", ] monitor_process = subprocess.Popen(monitor_cmd) print(f"Monitor started with PID: {monitor_process.pid}") print(f"Web dashboard available at: http://localhost:{web_port}") - + print("\n" + "=" * 50) print("Both processes are running!") print("Press Ctrl+C to stop both processes") print("Monitor will continue running even if benchmark finishes") print("=" * 50) - + # Monitor the processes while True: time.sleep(5) - + # Check if benchmark process is still running if benchmark_process and benchmark_process.poll() is not None: print("Benchmark process ended") benchmark_process = None - + # Check if monitor process is still running if monitor_process and monitor_process.poll() is not None: print("Monitor process died unexpectedly. 
Restarting...") monitor_process = subprocess.Popen(monitor_cmd) print(f"Monitor restarted with PID: {monitor_process.pid}") - + except KeyboardInterrupt: cleanup_processes() - + return 0 if __name__ == "__main__": import fire + fire.Fire(main) diff --git a/utils/progress_check/gaia_web_monitor.py b/utils/progress_check/gaia_web_monitor.py index e1efc5a..6e43c3b 100644 --- a/utils/progress_check/gaia_web_monitor.py +++ b/utils/progress_check/gaia_web_monitor.py @@ -13,70 +13,68 @@ """ import json -import sys import time import argparse from pathlib import Path -from typing import Dict, List, Tuple, Any, Optional -from datetime import datetime, timedelta +from typing import Dict, List, Any, Optional +from datetime import datetime import threading import os from http.server import HTTPServer, BaseHTTPRequestHandler -import urllib.parse class WebDashboard: """Simple web dashboard for monitoring""" - + def __init__(self, monitor, port: int = 8080): self.monitor = monitor self.port = port self.server = None - + def start_server(self): """Start the web server""" handler = self.create_handler() - self.server = HTTPServer(('localhost', self.port), handler) + self.server = HTTPServer(("localhost", self.port), handler) print(f"Web dashboard available at: http://localhost:{self.port}") - + def run_server(): self.server.serve_forever() - + thread = threading.Thread(target=run_server, daemon=True) thread.start() - + def create_handler(self): """Create HTTP request handler""" monitor = self.monitor - + class DashboardHandler(BaseHTTPRequestHandler): def do_GET(self): - if self.path == '/': + if self.path == "/": self.send_dashboard() - elif self.path == '/api/status': + elif self.path == "/api/status": self.send_json(monitor.get_status_json()) - elif self.path == '/api/tasks': + elif self.path == "/api/tasks": self.send_json(monitor.get_tasks_json()) - elif self.path.startswith('/api/task-report/'): - task_id = self.path.split('/')[-1] + elif self.path.startswith("/api/task-report/"): + task_id = self.path.split("/")[-1] self.send_task_report(task_id) else: self.send_error(404) - + def send_dashboard(self): self.send_response(200) - self.send_header('Content-type', 'text/html') + self.send_header("Content-type", "text/html") self.end_headers() - + html = self.generate_dashboard_html() self.wfile.write(html.encode()) - + def send_json(self, data): self.send_response(200) - self.send_header('Content-type', 'application/json') + self.send_header("Content-type", "application/json") self.end_headers() self.wfile.write(json.dumps(data, default=str).encode()) - + def send_task_report(self, task_id): """Send task report for a specific task""" try: @@ -85,26 +83,23 @@ def send_task_report(self, task_id): if not task_info: self.send_error(404, "Task not found") return - + # Generate report using the generate_gaia_report script report_content = monitor.generate_task_report(task_id) if not report_content: self.send_error(500, "Failed to generate report") return - + self.send_response(200) - self.send_header('Content-type', 'text/plain; charset=utf-8') + self.send_header("Content-type", "text/plain; charset=utf-8") self.end_headers() - self.wfile.write(report_content.encode('utf-8')) - + self.wfile.write(report_content.encode("utf-8")) + except Exception as e: self.send_error(500, f"Error generating report: {str(e)}") - + def generate_dashboard_html(self): - status = monitor.get_status_json() - tasks = monitor.get_tasks_json() - - return f""" + return """ @@ -112,26 +107,26 @@ def generate_dashboard_html(self): + + +
+            <h1>{benchmark_name} Monitor Dashboard</h1>
+            <h2>Overall Progress</h2>
+            <div>Progress: 0%</div>
+            <h2>Key Metrics</h2>
+            <div>0 Total Tasks</div>
+            <div>0 Completed</div>
+            <div>0 Running</div>
+            <div>0 Failed</div>
+            <h2>Recent Tasks</h2>
+            <div>Loading...</div>
+ + + """ + + return DashboardHandler + + +class BenchmarkMonitor: + """Generic benchmark monitor with web interface""" + + def __init__(self, log_folder: str): + self.log_folder = Path(log_folder) + self.start_time = datetime.now() + self.benchmark_name = self._detect_benchmark_name() + self.benchmark_type = self._detect_benchmark_type() + + # Initialize statistics based on benchmark type + self.stats = self._initialize_stats() + + self.tasks = {} + self.recent_activity = [] + + def _detect_benchmark_name(self) -> str: + """Detect benchmark name from log folder path or config file""" + # Try to get from .hydra/config.yaml first + hydra_config_path = self.log_folder / ".hydra" / "config.yaml" + if hydra_config_path.exists(): + try: + cfg = OmegaConf.load(hydra_config_path) + benchmark_name = cfg.get("benchmark", {}).get("name", "") + if benchmark_name: + return self._format_benchmark_name(benchmark_name) + except Exception: + pass + + # Try to extract from path (e.g., logs/gaia/... -> GAIA) + path_parts = self.log_folder.parts + if "logs" in path_parts: + idx = path_parts.index("logs") + if idx + 1 < len(path_parts): + benchmark_name = path_parts[idx + 1] + return self._format_benchmark_name(benchmark_name) + + # Default fallback + return "Benchmark" + + def _format_benchmark_name(self, name: str) -> str: + """Format benchmark name to a friendly display format""" + name_lower = name.lower().replace("-", "").replace("_", "") + + # Map common benchmark names to their preferred display format + name_mapping = { + "finsearchcomp": "FinSearchComp", + "futurex": "FutureX", + "future-x": "FutureX", + "gaia": "GAIA", + "xbench": "xbench", + "x-bench": "xbench", + "browsecomp": "BrowseComp", + "browsecomp-zh": "BrowseComp-ZH", + } + + # Check exact match first + if name_lower in name_mapping: + return name_mapping[name_lower] + + # Check partial match (e.g., "finsearchcomp-claude" -> "FinSearchComp") + for key, value in name_mapping.items(): + if name_lower.startswith(key): + return value + + # Default: convert to title case (e.g., "example_dataset" -> "Example Dataset") + return name.replace("-", " ").replace("_", " ").title() + + def _detect_benchmark_type(self) -> str: + """Detect benchmark type to determine statistics logic""" + name_lower = self.benchmark_name.lower() + + if "gaia" in name_lower: + return "gaia" # Has ground truth, needs correctness evaluation + elif "futurex" in name_lower or "future-x" in name_lower: + return "futurex" # No ground truth, prediction-focused + elif "xbench" in name_lower or "x-bench" in name_lower: + return "xbench" # No ground truth, prediction-focused + elif "finsearchcomp" in name_lower or "finsearch-comp" in name_lower: + return "finsearchcomp" # Has ground truth, needs task type breakdown + else: + return "default" # Default: assume has ground truth + + def _initialize_stats(self) -> Dict[str, Any]: + """Initialize statistics based on benchmark type""" + base_stats = { + "total_tasks": 0, + "completed_tasks": 0, + "running_tasks": 0, + "failed_tasks": 0, + "execution_times": [], + "error_types": {}, + "task_types": {}, + "last_update": None, + } + + if self.benchmark_type == "gaia": + # GAIA: correctness evaluation + base_stats.update({ + "correct_answers": 0, + "incorrect_answers": 0, + }) + elif self.benchmark_type in ["futurex", "xbench"]: + # FutureX/xbench: prediction-focused + base_stats.update({ + "with_predictions": 0, + "without_predictions": 0, + "with_errors": 0, + }) + elif self.benchmark_type == "finsearchcomp": + # FinSearchComp: task type 
and regional breakdown (like check_finsearchcomp_progress.py) + base_stats.update({ + "correct_answers": 0, # T2+T3 only + "incorrect_answers": 0, # T2+T3 only + "task_type_breakdown": { + "T1": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}, + "T2": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}, + "T3": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}, + "Unknown": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}, + }, + "regional_breakdown": { + "Global": { + "T2": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}, + "T3": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}, + }, + "Greater China": { + "T2": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}, + "T3": {"total": 0, "completed": 0, "correct": 0, "incorrect": 0}, + }, + }, + }) + else: + # Default: assume has ground truth + base_stats.update({ + "correct_answers": 0, + "incorrect_answers": 0, + }) + + return base_stats + + def scan_log_files(self) -> List[Path]: + """Scan for all task log files""" + if not self.log_folder.exists(): + return [] + return sorted( + self.log_folder.glob("task_*_attempt_*.json"), + key=lambda x: x.stat().st_mtime, + reverse=True, + ) + + def parse_task_file(self, file_path: Path) -> Optional[Dict[str, Any]]: + """Parse a single task log file""" + try: + with open(file_path, "r", encoding="utf-8") as f: + return json.load(f) + except (json.JSONDecodeError, FileNotFoundError, KeyError): + return None + + def extract_task_info( + self, data: Dict[str, Any], file_path: Path + ) -> Dict[str, Any]: + """Extract relevant information from task data""" + task_id = data.get("task_id", "unknown") + status = data.get("status", "unknown").lower() + judge_result = data.get("judge_result", "").upper() + final_answer = data.get("final_boxed_answer", "") + error_msg = data.get("error", "") + + # Extract attempt number from filename (e.g., task_xxx_attempt_1.json -> 1) + attempt = 1 # Default + match = re.search(r"_attempt_(\d+)\.json$", str(file_path)) + if match: + attempt = int(match.group(1)) + + # Extract execution time + start_time = data.get("start_time") + end_time = data.get("end_time") + execution_time = None + + if start_time and end_time: + try: + start_dt = datetime.fromisoformat(start_time.replace("Z", "+00:00")) + end_dt = datetime.fromisoformat(end_time.replace("Z", "+00:00")) + execution_time = (end_dt - start_dt).total_seconds() + except Exception: + pass + + # Extract task type from metadata or task_id + task_type = "" + metadata = data.get("metadata", {}) + if isinstance(metadata, dict): + # Try to get task type from various metadata fields + if "Level" in metadata: + task_type = f"Level {metadata['Level']}" + elif "task_type" in metadata: + task_type = str(metadata["task_type"]) + elif "type" in metadata: + task_type = str(metadata["type"]) + elif "difficulty" in metadata: + task_type = f"Difficulty {metadata['difficulty']}" + + # For FinSearchComp, extract task type from task_id (e.g., "(T1)Time_Sensitive_Data_Fetching_006") + if self.benchmark_type == "finsearchcomp" and not task_type: + match = re.match(r"^\(T(\d+)\)", task_id) + if match: + task_type = f"T{match.group(1)}" + + # Extract region for FinSearchComp + region = "" + if self.benchmark_type == "finsearchcomp": + label = data.get("input", {}).get("metadata", {}).get("label", "") + if "(Global)" in label: + region = "Global" + elif "(Greater China)" in label: + region = "Greater China" + + return { + "task_id": task_id, + "file_path": str(file_path), + "status": 
status, + "judge_result": judge_result, + "final_answer": final_answer, + "error": error_msg, + "execution_time": execution_time, + "task_type": task_type, + "region": region, + "attempt": attempt, + "last_modified": file_path.stat().st_mtime, + } + + def update_statistics(self, task_info: Dict[str, Any]): + """Update monitoring statistics based on benchmark type""" + task_id = task_info["task_id"] + status = task_info["status"] + judge_result = task_info["judge_result"] + execution_time = task_info["execution_time"] + final_answer = task_info.get("final_answer", "") + error_msg = task_info.get("error", "") + task_type = task_info.get("task_type", "") + + # Update task tracking + if task_id not in self.tasks: + self.tasks[task_id] = task_info + self.stats["total_tasks"] += 1 + region = task_info.get("region", "") + self._update_stats_for_new_task(status, judge_result, final_answer, error_msg, task_type, region) + else: + # Update existing task - only update if status changed + old_status = self.tasks[task_id]["status"] + if old_status != status: + self.recent_activity.append( + { + "task_id": task_id, + "old_status": old_status, + "new_status": status, + "timestamp": datetime.now(), + } + ) + old_region = self.tasks[task_id].get("region", "") + new_region = task_info.get("region", "") + self._update_stats_for_status_change( + old_status, status, + self.tasks[task_id].get("judge_result", ""), + judge_result, + self.tasks[task_id].get("final_answer", ""), + final_answer, + self.tasks[task_id].get("error", ""), + error_msg, + task_type, + old_region, + new_region + ) + self.tasks[task_id] = task_info + + # Track execution times + if execution_time is not None: + self.stats["execution_times"].append(execution_time) + if len(self.stats["execution_times"]) > 100: + self.stats["execution_times"] = self.stats["execution_times"][-100:] + + def _update_stats_for_new_task(self, status: str, judge_result: str, + final_answer: str, error_msg: str, task_type: str, region: str = ""): + """Update statistics for a new task based on benchmark type (like check_finsearchcomp_progress.py)""" + if status == "completed": + self.stats["completed_tasks"] += 1 + + if self.benchmark_type == "gaia": + if judge_result == "CORRECT": + self.stats["correct_answers"] += 1 + elif judge_result in ["INCORRECT", "ERROR"]: + self.stats["incorrect_answers"] += 1 + elif self.benchmark_type in ["futurex", "xbench"]: + # For xbench/futurex: count predictions for all tasks (like check_xbench_progress.py) + # But prediction_rate is calculated as with_predictions / completed + pass # Predictions and errors are counted below for all statuses + elif self.benchmark_type == "finsearchcomp": + if task_type in ["T1", "T2", "T3", "Unknown"]: + self.stats["task_type_breakdown"][task_type]["completed"] += 1 + + # For T1 tasks, exclude from correctness evaluation (like check_finsearchcomp_progress.py) + # T1 tasks are considered "completed" but not evaluated for correctness due to outdated ground truth + if task_type == "T1": + pass # T1 tasks are excluded from correctness evaluation + elif task_type in ["T2", "T3"]: + # For T2 and T3 tasks, evaluate correctness (like check_finsearchcomp_progress.py) + # If judge_result is CORRECT, count as correct; otherwise (including NOT_ATTEMPTED) count as incorrect + if judge_result == "CORRECT": + self.stats["correct_answers"] += 1 + self.stats["task_type_breakdown"][task_type]["correct"] += 1 + # Update regional breakdown for correct T2 and T3 tasks + if region in ["Global", "Greater China"]: + 
self.stats["regional_breakdown"][region][task_type]["correct"] += 1 + else: + # All non-CORRECT results (including NOT_ATTEMPTED, INCORRECT, ERROR) count as incorrect + self.stats["incorrect_answers"] += 1 + self.stats["task_type_breakdown"][task_type]["incorrect"] += 1 + # Update regional breakdown for incorrect T2 and T3 tasks + if region in ["Global", "Greater China"]: + self.stats["regional_breakdown"][region][task_type]["incorrect"] += 1 + else: # default + if judge_result == "CORRECT": + self.stats["correct_answers"] += 1 + elif judge_result in ["INCORRECT", "ERROR"]: + self.stats["incorrect_answers"] += 1 + elif status == "running": + self.stats["running_tasks"] += 1 + elif status in ["failed", "error", "interrupted"]: + self.stats["failed_tasks"] += 1 + + # For xbench/futurex: count predictions and errors for ALL tasks (like check_xbench_progress.py) + if self.benchmark_type in ["futurex", "xbench"]: + if final_answer and final_answer.strip(): + self.stats["with_predictions"] += 1 + else: + self.stats["without_predictions"] += 1 + if error_msg and error_msg.strip(): + self.stats["with_errors"] += 1 + + # Update task type breakdown for FinSearchComp + if self.benchmark_type == "finsearchcomp" and task_type: + if task_type in ["T1", "T2", "T3", "Unknown"]: + self.stats["task_type_breakdown"][task_type]["total"] += 1 + # Update regional breakdown for T2 and T3 tasks + if task_type in ["T2", "T3"] and region in ["Global", "Greater China"]: + self.stats["regional_breakdown"][region][task_type]["total"] += 1 + if status == "completed": + self.stats["regional_breakdown"][region][task_type]["completed"] += 1 + + def _update_stats_for_status_change(self, old_status: str, new_status: str, + old_judge_result: str, new_judge_result: str, + old_final_answer: str, new_final_answer: str, + old_error: str, new_error: str, + task_type: str, old_region: str = "", new_region: str = ""): + """Update statistics when task status changes""" + # Decrease old status count + if old_status == "completed": + self.stats["completed_tasks"] -= 1 + if self.benchmark_type == "gaia": + if old_judge_result == "CORRECT": + self.stats["correct_answers"] -= 1 + elif old_judge_result in ["INCORRECT", "ERROR"]: + self.stats["incorrect_answers"] -= 1 + elif self.benchmark_type in ["futurex", "xbench"]: + # Predictions and errors are updated below for all statuses + pass + elif self.benchmark_type == "finsearchcomp": + if task_type in ["T1", "T2", "T3", "Unknown"]: + self.stats["task_type_breakdown"][task_type]["completed"] -= 1 + # For T1 tasks, exclude from correctness evaluation (like check_finsearchcomp_progress.py) + if task_type == "T1": + pass # T1 tasks are excluded from correctness evaluation + elif task_type in ["T2", "T3"]: + # Like check_finsearchcomp_progress.py: if CORRECT, count as correct; otherwise as incorrect + if old_judge_result == "CORRECT": + self.stats["correct_answers"] -= 1 + self.stats["task_type_breakdown"][task_type]["correct"] -= 1 + # Update regional breakdown for correct T2 and T3 tasks + if old_region in ["Global", "Greater China"]: + self.stats["regional_breakdown"][old_region][task_type]["correct"] -= 1 + else: + # All non-CORRECT results count as incorrect + self.stats["incorrect_answers"] -= 1 + self.stats["task_type_breakdown"][task_type]["incorrect"] -= 1 + # Update regional breakdown for incorrect T2 and T3 tasks + if old_region in ["Global", "Greater China"]: + self.stats["regional_breakdown"][old_region][task_type]["incorrect"] -= 1 + # Update regional breakdown for completed T2 
and T3 tasks + if old_region in ["Global", "Greater China"]: + self.stats["regional_breakdown"][old_region][task_type]["completed"] -= 1 + else: # default + if old_judge_result == "CORRECT": + self.stats["correct_answers"] -= 1 + elif old_judge_result in ["INCORRECT", "ERROR"]: + self.stats["incorrect_answers"] -= 1 + elif old_status == "running": + self.stats["running_tasks"] -= 1 + elif old_status in ["failed", "error", "interrupted"]: + self.stats["failed_tasks"] -= 1 + + # Increase new status count + if new_status == "completed": + self.stats["completed_tasks"] += 1 + if self.benchmark_type == "gaia": + if new_judge_result == "CORRECT": + self.stats["correct_answers"] += 1 + elif new_judge_result in ["INCORRECT", "ERROR"]: + self.stats["incorrect_answers"] += 1 + elif self.benchmark_type in ["futurex", "xbench"]: + # Predictions and errors are updated below for all statuses + pass + elif self.benchmark_type == "finsearchcomp": + if task_type in ["T1", "T2", "T3", "Unknown"]: + self.stats["task_type_breakdown"][task_type]["completed"] += 1 + + # For T1 tasks, exclude from correctness evaluation (like check_finsearchcomp_progress.py) + # T1 tasks are considered "completed" but not evaluated for correctness due to outdated ground truth + if task_type == "T1": + pass # T1 tasks are excluded from correctness evaluation + elif task_type in ["T2", "T3"]: + # For T2 and T3 tasks, evaluate correctness (like check_finsearchcomp_progress.py) + # If judge_result is CORRECT, count as correct; otherwise (including NOT_ATTEMPTED) count as incorrect + if new_judge_result == "CORRECT": + self.stats["correct_answers"] += 1 + self.stats["task_type_breakdown"][task_type]["correct"] += 1 + # Update regional breakdown for correct T2 and T3 tasks + if new_region in ["Global", "Greater China"]: + self.stats["regional_breakdown"][new_region][task_type]["correct"] += 1 + else: + # All non-CORRECT results (including NOT_ATTEMPTED, INCORRECT, ERROR) count as incorrect + self.stats["incorrect_answers"] += 1 + self.stats["task_type_breakdown"][task_type]["incorrect"] += 1 + # Update regional breakdown for incorrect T2 and T3 tasks + if new_region in ["Global", "Greater China"]: + self.stats["regional_breakdown"][new_region][task_type]["incorrect"] += 1 + # Update regional breakdown for completed T2 and T3 tasks + if new_region in ["Global", "Greater China"]: + self.stats["regional_breakdown"][new_region][task_type]["completed"] += 1 + else: # default + if new_judge_result == "CORRECT": + self.stats["correct_answers"] += 1 + elif new_judge_result in ["INCORRECT", "ERROR"]: + self.stats["incorrect_answers"] += 1 + elif new_status == "running": + self.stats["running_tasks"] += 1 + elif new_status in ["failed", "error", "interrupted"]: + self.stats["failed_tasks"] += 1 + + # For xbench/futurex: update predictions and errors for ALL statuses (like check_xbench_progress.py) + if self.benchmark_type in ["futurex", "xbench"]: + # Decrease old counts + if old_final_answer and old_final_answer.strip(): + self.stats["with_predictions"] -= 1 + else: + self.stats["without_predictions"] -= 1 + if old_error and old_error.strip(): + self.stats["with_errors"] -= 1 + + # Increase new counts + if new_final_answer and new_final_answer.strip(): + self.stats["with_predictions"] += 1 + else: + self.stats["without_predictions"] += 1 + if new_error and new_error.strip(): + self.stats["with_errors"] += 1 + + def get_status_json(self) -> Dict[str, Any]: + """Get current status as JSON for web interface, based on benchmark type""" + total = 
self.stats["total_tasks"] + completed = self.stats["completed_tasks"] + running = self.stats["running_tasks"] + failed = self.stats["failed_tasks"] + + progress_pct = (completed / total * 100) if total > 0 else 0 + progress_pct = min(progress_pct, 100.0) # Cap at 100% + + exec_times = self.stats["execution_times"] + avg_execution_time = sum(exec_times) / len(exec_times) if exec_times else 0 + + elapsed_time = (datetime.now() - self.start_time).total_seconds() + tasks_per_second = completed / elapsed_time if elapsed_time > 0 else 0 + + result = { + "total_tasks": total, + "completed_tasks": completed, + "running_tasks": running, + "failed_tasks": failed, + "progress_pct": progress_pct, + "avg_execution_time": avg_execution_time, + "tasks_per_second": tasks_per_second, + "benchmark_type": self.benchmark_type, + "last_update": self.stats["last_update"].isoformat() + if self.stats["last_update"] + else None, + } + + # Add type-specific metrics + if self.benchmark_type == "gaia": + total_judged = self.stats["correct_answers"] + self.stats["incorrect_answers"] + accuracy = ( + (self.stats["correct_answers"] / total_judged * 100) + if total_judged > 0 + else 0 + ) + result.update({ + "correct_answers": self.stats["correct_answers"], + "incorrect_answers": self.stats["incorrect_answers"], + "accuracy": accuracy, + }) + elif self.benchmark_type in ["futurex", "xbench"]: + prediction_rate = ( + (self.stats["with_predictions"] / completed * 100) + if completed > 0 + else 0 + ) + result.update({ + "with_predictions": self.stats["with_predictions"], + "without_predictions": self.stats["without_predictions"], + "with_errors": self.stats["with_errors"], + "prediction_rate": prediction_rate, + }) + elif self.benchmark_type == "finsearchcomp": + t2_t3_completed = ( + self.stats["task_type_breakdown"]["T2"]["completed"] + + self.stats["task_type_breakdown"]["T3"]["completed"] + ) + t2_t3_correct = ( + self.stats["task_type_breakdown"]["T2"]["correct"] + + self.stats["task_type_breakdown"]["T3"]["correct"] + ) + accuracy = ( + (t2_t3_correct / t2_t3_completed * 100) + if t2_t3_completed > 0 + else 0 + ) + result.update({ + "correct_answers": self.stats["correct_answers"], # T2+T3 only + "incorrect_answers": self.stats["incorrect_answers"], # T2+T3 only + "accuracy": accuracy, # T2+T3 accuracy + "task_type_breakdown": self.stats["task_type_breakdown"], + "regional_breakdown": self.stats["regional_breakdown"], # Like check_finsearchcomp_progress.py + "t1_completed": self.stats["task_type_breakdown"]["T1"]["completed"], + }) + else: # default + total_judged = self.stats["correct_answers"] + self.stats["incorrect_answers"] + accuracy = ( + (self.stats["correct_answers"] / total_judged * 100) + if total_judged > 0 + else 0 + ) + result.update({ + "correct_answers": self.stats["correct_answers"], + "incorrect_answers": self.stats["incorrect_answers"], + "accuracy": accuracy, + }) + + return result + + def get_tasks_json(self) -> List[Dict[str, Any]]: + """Get tasks list as JSON for web interface""" + tasks_list = [] + for task_info in sorted( + self.tasks.values(), key=lambda x: x["last_modified"], reverse=True + ): + # For FutureX/xbench, don't include judge_result (like check_futurex_progress.py, check_xbench_progress.py) + task_dict = { + "task_id": task_info["task_id"], + "status": task_info["status"], + "task_type": task_info["task_type"], + "execution_time": task_info["execution_time"], + } + + # Exclude judge_result for FutureX and xbench (like check_futurex_progress.py, check_xbench_progress.py) + if 
self.benchmark_type not in ["futurex", "xbench"]: + task_dict["judge_result"] = task_info["judge_result"] + else: + # For FutureX/xbench, include final_answer instead (for display purposes) + task_dict["final_answer"] = task_info.get("final_answer", "") + + tasks_list.append(task_dict) + + return tasks_list + + def scan_and_update(self): + """Scan log files and update statistics""" + log_files = self.scan_log_files() + + for file_path in log_files: + data = self.parse_task_file(file_path) + if data: + task_info = self.extract_task_info(data, file_path) + self.update_statistics(task_info) + + self.stats["last_update"] = datetime.now() + + def get_task_info(self, task_id: str) -> Optional[Dict[str, Any]]: + """Get information about a specific task""" + return self.tasks.get(task_id) + + def generate_task_report(self, task_id: str) -> Optional[str]: + """Generate report by calling the standalone report generator""" + try: + # Get task info to extract attempt number + task_info = self.get_task_info(task_id) + if not task_info: + return f"Error: Task {task_id} not found" + + attempt = task_info.get("attempt", 1) + + # Import the report generator module + import importlib.util + report_generator_path = os.path.join( + os.path.dirname(__file__), "generate_benchmark_report.py" + ) + + spec = importlib.util.spec_from_file_location( + "generate_benchmark_report", + report_generator_path, + ) + if spec is None or spec.loader is None: + return f"Error: Could not load report generator module" + + report_module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(report_module) + + # Call the report generator + report_path = report_module.generate_task_report_from_log( + log_dir=str(self.log_folder), + task_id=task_id, + attempt=attempt, + output_dir=None, # Use default output directory + ) + + if report_path and os.path.exists(report_path): + # Read and return the generated report + with open(report_path, "r", encoding="utf-8") as f: + return f.read() + + return f"Error: Failed to generate report for task {task_id}" + + except Exception as e: + return f"Error generating report for task {task_id}: {str(e)}" + + +def main(): + parser = argparse.ArgumentParser(description="Benchmark Monitor with Web Interface") + parser.add_argument("log_folder", nargs="?", default=".", help="Path to benchmark log folder") + parser.add_argument("--web-port", type=int, default=8080, help="Web interface port") + + args = parser.parse_args() + + if not Path(args.log_folder).exists(): + print(f"Error: Log folder not found: {args.log_folder}") + return 1 + + # Create monitor + monitor = BenchmarkMonitor(args.log_folder) + + # Start web dashboard + dashboard = WebDashboard(monitor, args.web_port) + dashboard.start_server() + + print("Benchmark Monitor started") + print(f"Monitoring logs in: {args.log_folder}") + print(f"Web dashboard: http://localhost:{dashboard.port}") + print("Press Ctrl+C to stop") + + try: + while True: + monitor.scan_and_update() + time.sleep(30) # Update every 30 seconds + except KeyboardInterrupt: + print("\nMonitor stopped by user") + + return 0 + + +if __name__ == "__main__": + exit(main()) diff --git a/utils/progress_check/generate_benchmark_report.py b/utils/progress_check/generate_benchmark_report.py new file mode 100644 index 0000000..ffff25c --- /dev/null +++ b/utils/progress_check/generate_benchmark_report.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python3 +""" +Generic Benchmark Task Report Generator + +This script generates detailed text reports for tasks from benchmark log files. 
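+
+Typical invocation (the log directory and task id here are placeholders, not taken from a real run):
+
+    uv run utils/progress_check/generate_benchmark_report.py <log_dir> <task_id> --attempt 1
+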
+Works with any benchmark dataset (GAIA, FinSearchComp, FutureX, etc.) +""" + +import json +import os +import sys +from pathlib import Path +from typing import Optional, Dict, Any + + +def find_task_log_file(log_dir: str, task_id: str, attempt: int = 1) -> Optional[Path]: + """Find task log file in the log directory""" + log_path = Path(log_dir) + if not log_path.exists(): + return None + + # Try to find the log file + pattern = f"task_{task_id}_attempt_{attempt}.json" + log_file = log_path / pattern + + if log_file.exists(): + return log_file + + # Try without attempt number + pattern = f"task_{task_id}.json" + log_file = log_path / pattern + if log_file.exists(): + return log_file + + return None + + +def load_task_from_log(log_file: Path) -> Optional[Dict[str, Any]]: + """Load task data from log file""" + try: + with open(log_file, "r", encoding="utf-8") as f: + return json.load(f) + except (json.JSONDecodeError, FileNotFoundError): + return None + + +def extract_question(log_data: Dict[str, Any]) -> str: + """Extract question from log data in various formats""" + # Try different possible locations + if "task_question" in log_data: + return log_data["task_question"] + + if "input" in log_data: + input_data = log_data["input"] + if isinstance(input_data, dict): + if "task_description" in input_data: + return input_data["task_description"] + elif "task_question" in input_data: + return input_data["task_question"] + elif isinstance(input_data, str): + return input_data + + return "N/A" + + +def extract_metadata_info(log_data: Dict[str, Any]) -> Dict[str, Any]: + """Extract metadata information from log data""" + metadata_info = {} + + # Try to get metadata from various locations + metadata = log_data.get("metadata", {}) + if isinstance(metadata, dict): + metadata_info.update(metadata) + + # Also check input.metadata + if "input" in log_data and isinstance(log_data["input"], dict): + input_metadata = log_data["input"].get("metadata", {}) + if isinstance(input_metadata, dict): + metadata_info.update(input_metadata) + + return metadata_info + + +def generate_task_report_from_log( + log_dir: str, + task_id: str, + attempt: int = 1, + output_dir: Optional[str] = None +) -> Optional[str]: + """Generate detailed text report from task log file""" + + # Find the log file + log_file = find_task_log_file(log_dir, task_id, attempt) + if not log_file: + print(f"❌ Error: Log file not found for task {task_id} (attempt {attempt})") + return None + + # Load task data + log_data = load_task_from_log(log_file) + if not log_data: + print(f"❌ Error: Failed to load log file: {log_file}") + return None + + # Set output directory (default to log_dir/reports) + if output_dir is None: + output_dir = os.path.join(log_dir, "reports") + + # Ensure the directory exists + os.makedirs(output_dir, exist_ok=True) + + # Generate report file + report_filename = f"task_{task_id}_report.txt" + report_path = os.path.join(output_dir, report_filename) + + # Extract information + question = extract_question(log_data) + ground_truth = log_data.get("ground_truth", "N/A") + final_answer = log_data.get("final_boxed_answer", log_data.get("final_answer", "N/A")) + status = log_data.get("status", "unknown") + judge_result = log_data.get("judge_result", "N/A") + error = log_data.get("error", "") + + # Extract execution time + execution_time = None + start_time = log_data.get("start_time") + end_time = log_data.get("end_time") + if start_time and end_time: + try: + from datetime import datetime + start_dt = 
datetime.fromisoformat(start_time.replace("Z", "+00:00")) + end_dt = datetime.fromisoformat(end_time.replace("Z", "+00:00")) + execution_time = (end_dt - start_dt).total_seconds() + except Exception: + pass + + # Extract metadata + metadata_info = extract_metadata_info(log_data) + + # Generate report + with open(report_path, "w", encoding="utf-8") as f: + f.write("=" * 80 + "\n") + f.write(f"Benchmark Task Report: {task_id}\n") + f.write("=" * 80 + "\n\n") + + # Basic information + f.write("1. Task Basic Information\n") + f.write("-" * 40 + "\n") + f.write(f"Task ID: {task_id}\n") + f.write(f"Status: {status}\n") + f.write(f"Judge Result: {judge_result}\n") + if execution_time: + f.write(f"Execution Time: {execution_time:.2f} seconds\n") + if log_data.get("task_file_name"): + f.write(f"File Attachment: {log_data['task_file_name']}\n") + f.write("\n\n") + + # Question content + f.write("2. Question Content\n") + f.write("-" * 40 + "\n") + f.write(f"{question}\n\n\n") + + # Ground truth answer + f.write("3. Ground Truth Answer\n") + f.write("-" * 40 + "\n") + f.write(f"{ground_truth}\n\n\n") + + # Model answer + f.write("4. Model Answer\n") + f.write("-" * 40 + "\n") + f.write(f"{final_answer}\n\n\n") + + # Error information (if any) + if error: + f.write("5. Error Information\n") + f.write("-" * 40 + "\n") + f.write(f"{error}\n\n\n") + + # Metadata (if available) + if metadata_info: + f.write("6. Task Metadata\n") + f.write("-" * 40 + "\n") + for key, value in metadata_info.items(): + if isinstance(value, dict): + f.write(f"{key}:\n") + for sub_key, sub_value in value.items(): + f.write(f" {sub_key}: {sub_value}\n") + elif isinstance(value, list): + f.write(f"{key}: {', '.join(map(str, value))}\n") + else: + f.write(f"{key}: {value}\n") + f.write("\n\n") + + # Execution steps (if available) + if "step_logs" in log_data and log_data["step_logs"]: + f.write("7. 
Execution Steps\n") + f.write("-" * 40 + "\n") + f.write(f"Total steps: {len(log_data['step_logs'])}\n") + # Optionally include step details + f.write("\n") + + f.write("=" * 80 + "\n") + f.write("End of Report\n") + f.write("=" * 80 + "\n") + + print(f"📄 Task {task_id} report saved to: {report_path}") + return report_path + + +def main(): + """Main function""" + import argparse + + parser = argparse.ArgumentParser(description="Generate benchmark task reports from log files") + parser.add_argument( + "log_dir", + type=str, + help="Path to benchmark log directory", + ) + parser.add_argument( + "task_id", + type=str, + help="Task ID to generate report for", + ) + parser.add_argument( + "--attempt", + type=int, + default=1, + help="Attempt number (default: 1)", + ) + parser.add_argument( + "--output-dir", + type=str, + default=None, + help="Output directory for reports (default: /reports)", + ) + + args = parser.parse_args() + + generate_task_report_from_log( + args.log_dir, + args.task_id, + args.attempt, + args.output_dir + ) + + +if __name__ == "__main__": + main() + From 81b32bc4d38db8e7ffa22e5734f577dfe9c7e1dc Mon Sep 17 00:00:00 2001 From: JoeXic Date: Mon, 10 Nov 2025 22:34:23 +0000 Subject: [PATCH 09/11] update main.py --- main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index b7260e5..f0bb018 100644 --- a/main.py +++ b/main.py @@ -5,7 +5,7 @@ import utils.calculate_average_score import utils.calculate_score_from_log import common_benchmark -import run_gaia_with_monitor +import run_benchmark_with_monitor import dotenv import utils.eval_answer_from_log import fire @@ -36,7 +36,7 @@ def print_config(*args): "print-config": print_config, "trace": utils.trace_single_task.main, "common-benchmark": common_benchmark.main, - "run-gaia-with-monitor": run_gaia_with_monitor.main, + "run-benchmark-with-monitor": run_benchmark_with_monitor.main, "eval-answer": utils.eval_answer_from_log.main, "avg-score": utils.calculate_average_score.main, "score-from-log": utils.calculate_score_from_log.main, From 787454cff2b1ba5cce7719958a2b538df9e921c5 Mon Sep 17 00:00:00 2001 From: JoeXic Date: Mon, 10 Nov 2025 22:36:57 +0000 Subject: [PATCH 10/11] delete deprecated files --- run_gaia_with_monitor.py | 130 ----- utils/progress_check/gaia_web_monitor.py | 558 ------------------- utils/progress_check/generate_gaia_report.py | 185 ------ 3 files changed, 873 deletions(-) delete mode 100644 run_gaia_with_monitor.py delete mode 100644 utils/progress_check/gaia_web_monitor.py delete mode 100644 utils/progress_check/generate_gaia_report.py diff --git a/run_gaia_with_monitor.py b/run_gaia_with_monitor.py deleted file mode 100644 index 694fcb4..0000000 --- a/run_gaia_with_monitor.py +++ /dev/null @@ -1,130 +0,0 @@ -# SPDX-FileCopyrightText: 2025 MiromindAI -# -# SPDX-License-Identifier: Apache-2.0 - -import os -import subprocess -import signal -import sys -import time -from typing import Optional - - -def main(*args, config_file_name: str = "", output_dir: str = "", web_port: int = 8080): - """Run benchmark with integrated web monitoring""" - - # Validate required arguments - if not output_dir: - print("Error: output_dir is required") - print( - "Usage: uv run main.py run-gaia-with-monitor --output_dir=path --config_file_name=name" - ) - return 1 - - # Create output directory if it doesn't exist - os.makedirs(output_dir, exist_ok=True) - - print("=" * 50) - print("Benchmark Runner with Monitor") - print("=" * 50) - print(f"Output directory: {output_dir}") - print(f"Config 
name: {config_file_name}") - print(f"Web port: {web_port}") - print("=" * 50) - - # Global variables for process management - benchmark_process: Optional[subprocess.Popen] = None - monitor_process: Optional[subprocess.Popen] = None - - def cleanup_processes(): - """Clean up running processes""" - print("\nShutting down processes...") - - if benchmark_process and benchmark_process.poll() is None: - print(f"Stopping benchmark (PID: {benchmark_process.pid})...") - benchmark_process.terminate() - try: - benchmark_process.wait(timeout=5) - except subprocess.TimeoutExpired: - benchmark_process.kill() - - if monitor_process and monitor_process.poll() is None: - print(f"Stopping monitor (PID: {monitor_process.pid})...") - monitor_process.terminate() - try: - monitor_process.wait(timeout=5) - except subprocess.TimeoutExpired: - monitor_process.kill() - - print("Cleanup complete.") - - def signal_handler(signum, frame): - """Handle Ctrl+C gracefully""" - cleanup_processes() - sys.exit(0) - - # Set up signal handlers - signal.signal(signal.SIGINT, signal_handler) - signal.signal(signal.SIGTERM, signal_handler) - - try: - # Start benchmark - print("Starting benchmark...") - benchmark_cmd = [ - "uv", - "run", - "main.py", - "common-benchmark", - f"--config_file_name={config_file_name}", - f"output_dir={output_dir}", - ] - benchmark_process = subprocess.Popen(benchmark_cmd) - print(f"Benchmark started with PID: {benchmark_process.pid}") - - # Wait a moment for benchmark to initialize - time.sleep(3) - - # Start monitor - print("Starting web monitor...") - monitor_cmd = [ - "uv", - "run", - "utils/progress_check/gaia_web_monitor.py", - output_dir, - f"--web-port={web_port}", - ] - monitor_process = subprocess.Popen(monitor_cmd) - print(f"Monitor started with PID: {monitor_process.pid}") - print(f"Web dashboard available at: http://localhost:{web_port}") - - print("\n" + "=" * 50) - print("Both processes are running!") - print("Press Ctrl+C to stop both processes") - print("Monitor will continue running even if benchmark finishes") - print("=" * 50) - - # Monitor the processes - while True: - time.sleep(5) - - # Check if benchmark process is still running - if benchmark_process and benchmark_process.poll() is not None: - print("Benchmark process ended") - benchmark_process = None - - # Check if monitor process is still running - if monitor_process and monitor_process.poll() is not None: - print("Monitor process died unexpectedly. 
Restarting...") - monitor_process = subprocess.Popen(monitor_cmd) - print(f"Monitor restarted with PID: {monitor_process.pid}") - - except KeyboardInterrupt: - cleanup_processes() - - return 0 - - -if __name__ == "__main__": - import fire - - fire.Fire(main) diff --git a/utils/progress_check/gaia_web_monitor.py b/utils/progress_check/gaia_web_monitor.py deleted file mode 100644 index 6e43c3b..0000000 --- a/utils/progress_check/gaia_web_monitor.py +++ /dev/null @@ -1,558 +0,0 @@ -""" -GAIA Benchmark Monitor with Web Interface - -This script provides monitoring capabilities including: -- Real-time web dashboard -- Historical data tracking - -Usage: - uv run utils/progress_check/gaia_web_monitor.py [LOG_FOLDER_PATH] [OPTIONS] - -Options: - --web-port PORT Web interface port (default: 8080) -""" - -import json -import time -import argparse -from pathlib import Path -from typing import Dict, List, Any, Optional -from datetime import datetime -import threading -import os -from http.server import HTTPServer, BaseHTTPRequestHandler - - -class WebDashboard: - """Simple web dashboard for monitoring""" - - def __init__(self, monitor, port: int = 8080): - self.monitor = monitor - self.port = port - self.server = None - - def start_server(self): - """Start the web server""" - handler = self.create_handler() - self.server = HTTPServer(("localhost", self.port), handler) - print(f"Web dashboard available at: http://localhost:{self.port}") - - def run_server(): - self.server.serve_forever() - - thread = threading.Thread(target=run_server, daemon=True) - thread.start() - - def create_handler(self): - """Create HTTP request handler""" - monitor = self.monitor - - class DashboardHandler(BaseHTTPRequestHandler): - def do_GET(self): - if self.path == "/": - self.send_dashboard() - elif self.path == "/api/status": - self.send_json(monitor.get_status_json()) - elif self.path == "/api/tasks": - self.send_json(monitor.get_tasks_json()) - elif self.path.startswith("/api/task-report/"): - task_id = self.path.split("/")[-1] - self.send_task_report(task_id) - else: - self.send_error(404) - - def send_dashboard(self): - self.send_response(200) - self.send_header("Content-type", "text/html") - self.end_headers() - - html = self.generate_dashboard_html() - self.wfile.write(html.encode()) - - def send_json(self, data): - self.send_response(200) - self.send_header("Content-type", "application/json") - self.end_headers() - self.wfile.write(json.dumps(data, default=str).encode()) - - def send_task_report(self, task_id): - """Send task report for a specific task""" - try: - # Try to find the task in the current running tasks - task_info = monitor.get_task_info(task_id) - if not task_info: - self.send_error(404, "Task not found") - return - - # Generate report using the generate_gaia_report script - report_content = monitor.generate_task_report(task_id) - if not report_content: - self.send_error(500, "Failed to generate report") - return - - self.send_response(200) - self.send_header("Content-type", "text/plain; charset=utf-8") - self.end_headers() - self.wfile.write(report_content.encode("utf-8")) - - except Exception as e: - self.send_error(500, f"Error generating report: {str(e)}") - - def generate_dashboard_html(self): - return """ - - - - Benchmark Monitor Dashboard - - - - - - -
-            [dashboard HTML elided: a page titled "Benchmark Monitor Dashboard" with an "Overall Progress" bar ("Progress: 0%"), a "Key Metrics" panel (Total Tasks, Completed, Running, Failed and Accuracy counters), and a "Recent Tasks" table that shows "Loading..." until data arrives]
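The markup above is populated from the JSON endpoints the handler serves (/api/status, /api/tasks, /api/task-report/<task_id>). A minimal sketch of reading the status feed outside the browser, assuming the monitor is running locally on the default --web-port of 8080, could look like this:

    # Poll the monitor's /api/status endpoint and print a one-line summary.
    # Assumes localhost and the default port 8080; adjust as needed.
    import json
    import urllib.request

    def fetch_status(port: int = 8080) -> dict:
        """Return the payload produced by get_status_json()."""
        url = f"http://localhost:{port}/api/status"
        with urllib.request.urlopen(url, timeout=10) as resp:
            return json.loads(resp.read().decode("utf-8"))

    status = fetch_status()
    print(f"{status['completed_tasks']}/{status['total_tasks']} tasks "
          f"({status['progress_pct']:.1f}%), accuracy {status.get('accuracy', 0):.1f}%")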
- - - """ - - return DashboardHandler - - -class AdvancedBenchmarkMonitor: - """GAIA benchmark monitor with web interface""" - - def __init__(self, log_folder: str): - self.log_folder = Path(log_folder) - self.start_time = datetime.now() - # Alerts removed per user request - - # Statistics tracking - self.stats = { - "total_tasks": 0, - "completed_tasks": 0, - "running_tasks": 0, - "failed_tasks": 0, - "correct_answers": 0, - "incorrect_answers": 0, - "execution_times": [], - "error_types": {}, - "task_types": {}, - "last_update": None, - } - - self.tasks = {} - self.recent_activity = [] - self._generate_gaia_report_module = None - - def _load_generate_gaia_report_module(self): - """Lazy load the generate_gaia_report module""" - if self._generate_gaia_report_module is None: - import importlib.util - - spec = importlib.util.spec_from_file_location( - "generate_gaia_report", - os.path.join(os.path.dirname(__file__), "generate_gaia_report.py"), - ) - if spec is None or spec.loader is None: - return None - self._generate_gaia_report_module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(self._generate_gaia_report_module) - return self._generate_gaia_report_module - - def scan_log_files(self) -> List[Path]: - """Scan for all task log files""" - if not self.log_folder.exists(): - return [] - return sorted( - self.log_folder.glob("task_*_attempt_*.json"), - key=lambda x: x.stat().st_mtime, - reverse=True, - ) - - def parse_task_file(self, file_path: Path) -> Optional[Dict[str, Any]]: - """Parse a single task log file""" - try: - with open(file_path, "r", encoding="utf-8") as f: - return json.load(f) - except (json.JSONDecodeError, FileNotFoundError, KeyError): - return None - - def extract_task_info( - self, data: Dict[str, Any], file_path: Path - ) -> Dict[str, Any]: - """Extract relevant information from task data""" - task_id = data.get("task_id", "unknown") - status = data.get("status", "unknown").lower() - judge_result = data.get("judge_result", "").upper() - final_answer = data.get("final_boxed_answer", "") - error_msg = data.get("error", "") - - # Extract execution time - start_time = data.get("start_time") - end_time = data.get("end_time") - execution_time = None - - if start_time and end_time: - try: - start_dt = datetime.fromisoformat(start_time.replace("Z", "+00:00")) - end_dt = datetime.fromisoformat(end_time.replace("Z", "+00:00")) - execution_time = (end_dt - start_dt).total_seconds() - except Exception: - pass - - # Extract task type from metadata - task_type = "" - metadata = data.get("metadata", {}) - if isinstance(metadata, dict): - # Try to get task type from various metadata fields - if "Level" in metadata: - task_type = f"Level {metadata['Level']}" - elif "task_type" in metadata: - task_type = str(metadata["task_type"]) - elif "type" in metadata: - task_type = str(metadata["type"]) - elif "difficulty" in metadata: - task_type = f"Difficulty {metadata['difficulty']}" - - return { - "task_id": task_id, - "file_path": str(file_path), - "status": status, - "judge_result": judge_result, - "final_answer": final_answer, - "error": error_msg, - "execution_time": execution_time, - "task_type": task_type, - "last_modified": file_path.stat().st_mtime, - } - - def update_statistics(self, task_info: Dict[str, Any]): - """Update monitoring statistics and check for alerts""" - task_id = task_info["task_id"] - status = task_info["status"] - judge_result = task_info["judge_result"] - execution_time = task_info["execution_time"] - - # Update task tracking - if task_id not in 
self.tasks: - self.tasks[task_id] = task_info - self.stats["total_tasks"] += 1 - # Only count status for new tasks - if status == "completed": - self.stats["completed_tasks"] += 1 - if judge_result == "CORRECT": - self.stats["correct_answers"] += 1 - elif judge_result in ["INCORRECT", "ERROR"]: - self.stats["incorrect_answers"] += 1 - elif status == "running": - self.stats["running_tasks"] += 1 - elif status in ["failed", "error", "interrupted"]: - self.stats["failed_tasks"] += 1 - else: - # Update existing task - only update if status changed - old_status = self.tasks[task_id]["status"] - if old_status != status: - self.recent_activity.append( - { - "task_id": task_id, - "old_status": old_status, - "new_status": status, - "timestamp": datetime.now(), - } - ) - - # Decrease old status count - if old_status == "completed": - self.stats["completed_tasks"] -= 1 - old_judge_result = self.tasks[task_id]["judge_result"] - if old_judge_result == "CORRECT": - self.stats["correct_answers"] -= 1 - elif old_judge_result in ["INCORRECT", "ERROR"]: - self.stats["incorrect_answers"] -= 1 - elif old_status == "running": - self.stats["running_tasks"] -= 1 - elif old_status in ["failed", "error", "interrupted"]: - self.stats["failed_tasks"] -= 1 - - # Increase new status count - if status == "completed": - self.stats["completed_tasks"] += 1 - if judge_result == "CORRECT": - self.stats["correct_answers"] += 1 - elif judge_result in ["INCORRECT", "ERROR"]: - self.stats["incorrect_answers"] += 1 - elif status == "running": - self.stats["running_tasks"] += 1 - elif status in ["failed", "error", "interrupted"]: - self.stats["failed_tasks"] += 1 - - self.tasks[task_id] = task_info - - # Track execution times - if execution_time is not None: - self.stats["execution_times"].append(execution_time) - if len(self.stats["execution_times"]) > 100: - self.stats["execution_times"] = self.stats["execution_times"][-100:] - - # Alerts removed; no checks performed - - def get_status_json(self) -> Dict[str, Any]: - """Get current status as JSON for web interface""" - total = self.stats["total_tasks"] - completed = self.stats["completed_tasks"] - running = self.stats["running_tasks"] - failed = self.stats["failed_tasks"] - - progress_pct = (completed / total * 100) if total > 0 else 0 - progress_pct = min(progress_pct, 100.0) # Cap at 100% - - total_judged = self.stats["correct_answers"] + self.stats["incorrect_answers"] - accuracy = ( - (self.stats["correct_answers"] / total_judged * 100) - if total_judged > 0 - else 0 - ) - - exec_times = self.stats["execution_times"] - avg_execution_time = sum(exec_times) / len(exec_times) if exec_times else 0 - - elapsed_time = (datetime.now() - self.start_time).total_seconds() - tasks_per_second = completed / elapsed_time if elapsed_time > 0 else 0 - - return { - "total_tasks": total, - "completed_tasks": completed, - "running_tasks": running, - "failed_tasks": failed, - "progress_pct": progress_pct, - "accuracy": accuracy, - "avg_execution_time": avg_execution_time, - "tasks_per_second": tasks_per_second, - "last_update": self.stats["last_update"].isoformat() - if self.stats["last_update"] - else None, - } - - def get_tasks_json(self) -> List[Dict[str, Any]]: - """Get tasks list as JSON for web interface""" - return [ - { - "task_id": task_info["task_id"], - "status": task_info["status"], - "judge_result": task_info["judge_result"], - "task_type": task_info["task_type"], - "execution_time": task_info["execution_time"], - } - for task_info in sorted( - self.tasks.values(), key=lambda x: 
x["last_modified"], reverse=True - ) - ] - - def scan_and_update(self): - """Scan log files and update statistics""" - log_files = self.scan_log_files() - - for file_path in log_files: - data = self.parse_task_file(file_path) - if data: - task_info = self.extract_task_info(data, file_path) - self.update_statistics(task_info) - - self.stats["last_update"] = datetime.now() - - def get_task_info(self, task_id: str) -> Optional[Dict[str, Any]]: - """Get information about a specific task""" - return self.tasks.get(task_id) - - def generate_task_report(self, task_id: str) -> Optional[str]: - """Generate the original simple report (no execution details).""" - try: - # Import the original report generator (now in the same directory) - generate_module = self._load_generate_gaia_report_module() - if generate_module is None: - return None - generate_task_report = generate_module.generate_task_report - - # Map task_id to dataset index - task_index = self.find_task_index_in_dataset(task_id) - if task_index is None: - return None - - # Generate and return the plain report content - report_path = generate_task_report(task_index) - if report_path and os.path.exists(report_path): - with open(report_path, "r", encoding="utf-8") as f: - return f.read() - return None - - except Exception as e: - print(f"Error generating simple report for task {task_id}: {e}") - return None - - def find_task_index_in_dataset(self, task_id: str) -> Optional[int]: - """Find the index of a task in the GAIA dataset""" - try: - # Import from the same directory - generate_module = self._load_generate_gaia_report_module() - if generate_module is None: - return None - load_gaia_data = generate_module.load_gaia_data - - # Load GAIA data - tasks = load_gaia_data() - - # Find the task by ID - for i, task in enumerate(tasks): - if task.get("task_id") == task_id: - return i - - return None - - except Exception as e: - print(f"Error finding task {task_id} in dataset: {e}") - return None - - -def main(): - parser = argparse.ArgumentParser(description="GAIA Benchmark Monitor") - parser.add_argument("log_folder", nargs="?", default=".", help="Path to log folder") - parser.add_argument("--web-port", type=int, default=8080, help="Web interface port") - # Alert functionality removed; threshold flag no longer supported - - args = parser.parse_args() - - if not Path(args.log_folder).exists(): - print(f"Error: Log folder not found: {args.log_folder}") - return 1 - - # Create monitor - monitor = AdvancedBenchmarkMonitor(args.log_folder) - - # Start web dashboard - dashboard = WebDashboard(monitor, args.web_port) - dashboard.start_server() - - print("GAIA Benchmark Monitor started") - print(f"Web dashboard: http://localhost:{args.web_port}") - print("Press Ctrl+C to stop") - - try: - while True: - monitor.scan_and_update() - time.sleep(30) # Update every 30 seconds - except KeyboardInterrupt: - print("\nMonitor stopped by user") - - return 0 - - -if __name__ == "__main__": - exit(main()) diff --git a/utils/progress_check/generate_gaia_report.py b/utils/progress_check/generate_gaia_report.py deleted file mode 100644 index 5be2d19..0000000 --- a/utils/progress_check/generate_gaia_report.py +++ /dev/null @@ -1,185 +0,0 @@ -#!/usr/bin/env python3 -""" -GAIA Dataset Task Report Generator - -This script generates detailed text reports for specified tasks in the GAIA-val dataset. 
-""" - -import json -import os -import sys - - -def find_gaia_data_dir(): - """Find GAIA data directory automatically""" - # Get the directory where this script is located (utils/progress_check/) - script_dir = os.path.dirname(os.path.abspath(__file__)) - # Project root is two levels up from utils/progress_check/ - repo_root = os.path.abspath(os.path.join(script_dir, "..", "..")) - - # Try common locations - possible_paths = [ - os.path.join(repo_root, "data", "gaia-val"), # Project root/data/gaia-val - os.path.join( - script_dir, "..", "data", "gaia-val" - ), # utils/data/gaia-val (unlikely) - os.path.join( - script_dir, "data", "gaia-val" - ), # utils/progress_check/data/gaia-val (unlikely) - "data/gaia-val", # Relative from current working directory - ] - - for path in possible_paths: - abs_path = os.path.abspath(path) - jsonl_path = os.path.join(abs_path, "standardized_data.jsonl") - if os.path.exists(jsonl_path): - return abs_path - - # If not found, return default path (project root/data/gaia-val) - return os.path.join(repo_root, "data", "gaia-val") - - -def load_gaia_data(data_dir=None): - """Load GAIA validation dataset""" - if data_dir is None: - data_dir = find_gaia_data_dir() - - jsonl_path = os.path.join(data_dir, "standardized_data.jsonl") - - if not os.path.exists(jsonl_path): - print(f"❌ Error: GAIA data file not found at {jsonl_path}") - print("Please ensure the GAIA dataset is available in one of these locations:") - print("- data/gaia-val/standardized_data.jsonl") - print("- ../data/gaia-val/standardized_data.jsonl") - print("- Or specify the correct path using --data-dir argument") - sys.exit(1) - - tasks = [] - with open(jsonl_path, "r", encoding="utf-8") as f: - for line in f: - if line.strip(): - tasks.append(json.loads(line)) - - return tasks - - -def _default_reports_dir() -> str: - """Return absolute path to the default GAIA reports directory.""" - repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) - reports_dir = os.path.join(repo_root, "gaia_reports") - return reports_dir - - -def generate_task_report(task_index, data_dir=None, output_dir=None): - """Generate detailed text report for specified task""" - print("🚀 Loading GAIA dataset...") - tasks = load_gaia_data(data_dir) - - display_index = task_index + 1 - - if task_index >= len(tasks): - print( - f"❌ Error: Task index {display_index} out of range, dataset has {len(tasks)} tasks" - ) - return None - - print(f"📄 Generating task {display_index} report...") - - # Get task data - task = tasks[task_index] - - # Set output directory (default to /gaia_reports) - if output_dir is None: - output_dir = _default_reports_dir() - - # Ensure the directory exists - os.makedirs(output_dir, exist_ok=True) - - # Generate report file - report_path = os.path.join(output_dir, f"gaia_task_{display_index}_report.txt") - - with open(report_path, "w", encoding="utf-8") as f: - f.write("=" * 80 + "\n") - f.write(f"GAIA Dataset Task {display_index} Detailed Report\n") - f.write("=" * 80 + "\n\n") - - # Basic information - f.write("1. Task Basic Information\n") - f.write("-" * 40 + "\n") - f.write(f"Task ID: {task['task_id']}\n") - f.write(f"Difficulty Level: Level {task['metadata']['Level']}\n") - f.write(f"File Attachment: {'Yes' if task.get('file_path') else 'No'}\n") - if task.get("file_path"): - f.write(f"File Path: {task['file_path']}\n") - f.write("\n") - - # Question content - f.write("2. 
Question Content\n") - f.write("-" * 40 + "\n") - f.write(f"{task['task_question']}\n\n") - - # Ground truth answer - f.write("3. Ground Truth Answer\n") - f.write("-" * 40 + "\n") - f.write(f"{task['ground_truth']}\n\n") - - # Solution steps - f.write("4. Detailed Solution Steps\n") - f.write("-" * 40 + "\n") - f.write(f"{task['metadata']['Annotator Metadata']['Steps']}\n\n") - - # Metadata - f.write("5. Task Metadata\n") - f.write("-" * 40 + "\n") - metadata = task["metadata"]["Annotator Metadata"] - for key, value in metadata.items(): - if key != "Steps": # Skip Steps since it's shown in section 4 - if key == "Tools": - f.write(f"{key}:\n{value}\n\n") - else: - f.write(f"{key}: {value}\n\n") - f.write("\n") - - f.write("=" * 80 + "\n") - f.write("End of Report\n") - f.write("=" * 80 + "\n") - - print(f"📄 Task {display_index} detailed report saved to: {report_path}") - - return report_path - - -def main(): - """Main function""" - import argparse - - parser = argparse.ArgumentParser(description="Generate GAIA dataset task reports") - parser.add_argument( - "task_index", - nargs="?", - type=int, - default=1, - help="Task index to generate report for (1-based, default: 1)", - ) - parser.add_argument( - "--data-dir", - type=str, - default=None, - help="Path to GAIA data directory (auto-detected if not specified)", - ) - parser.add_argument( - "--output-dir", - type=str, - default=None, - help="Output directory for reports (default: /gaia_reports)", - ) - - args = parser.parse_args() - - task_index = args.task_index - 1 # Convert to 0-based for internal use - - generate_task_report(task_index, args.data_dir, args.output_dir) - - -if __name__ == "__main__": - main() From b569c1844910318215cf4c8cd372c7e90a8ea185 Mon Sep 17 00:00:00 2001 From: JoeXic Date: Mon, 10 Nov 2025 22:37:41 +0000 Subject: [PATCH 11/11] add import --- utils/progress_check/check_finsearchcomp_progress.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/progress_check/check_finsearchcomp_progress.py b/utils/progress_check/check_finsearchcomp_progress.py index 1104582..c96e9cc 100755 --- a/utils/progress_check/check_finsearchcomp_progress.py +++ b/utils/progress_check/check_finsearchcomp_progress.py @@ -21,7 +21,7 @@ import re import sys from pathlib import Path -from typing import Dict, List, Tuple +from typing import Dict, List, Tuple, Any def extract_task_type(task_id: str) -> str: @@ -61,7 +61,7 @@ def extract_region_from_label(label: str) -> str: return "Unknown" -def analyze_finsearchcomp_results(log_folder: str) -> Dict[str, any]: +def analyze_finsearchcomp_results(log_folder: str) -> Dict[str, Any]: """ Analyze FinSearchComp benchmark results from JSON log files. @@ -192,7 +192,7 @@ def analyze_finsearchcomp_results(log_folder: str) -> Dict[str, any]: def display_results( - results: Dict[str, any], + results: Dict[str, Any], correct_files: List[str], incorrect_files: List[Tuple[str, str]], error_files: List[Tuple[str, str]],