From 3a3d6b72477b0eaee5bde5f1a5df59c5d20078a8 Mon Sep 17 00:00:00 2001 From: "Timur M. Carstensen" <40788422+timurcarstensen@users.noreply.github.com> Date: Mon, 13 Oct 2025 22:27:17 +0200 Subject: [PATCH 01/39] Use lm-eval harness for INCLUDE and global MMLU --- README.md | 4 ++ apptainer/jureca.def | 3 + apptainer/leonardo.def | 3 + apptainer/lumi.def | 3 + oellm/interactive_csv_builder.py | 67 ++++++++++++++++--- oellm/light_eval_benchmarks/flores-200-eu.txt | 44 ++++++++++++ oellm/main.py | 62 +++++++++++++++-- oellm/task-groups.yaml | 64 ++++++++++++++++++ oellm/template.sbatch | 62 ++++++++++++++--- 9 files changed, 286 insertions(+), 26 deletions(-) create mode 100644 oellm/light_eval_benchmarks/flores-200-eu.txt diff --git a/README.md b/README.md index 52d59e3..cdc3f89 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,10 @@ This will launch an interactive workflow where you can: - Configure n-shot settings - Preview and save your evaluation configuration +The resulting CSV now includes an additional `eval_suite` column that records which +evaluation framework (e.g., `lm_eval` or `lighteval`) should be used for each +task. + Otherwise you can also directly schedule using a CSV file: ```bash oellm schedule-eval --eval_csv_path custom_evals.csv diff --git a/apptainer/jureca.def b/apptainer/jureca.def index 5aadca9..23cd237 100644 --- a/apptainer/jureca.def +++ b/apptainer/jureca.def @@ -17,6 +17,9 @@ From: nvcr.io/nvidia/pytorch:25.06-py3 uv pip install --system --break-system-packages lm-eval \ "transformers<=4.53.0" "datasets<4.0.0" wandb sentencepiece tiktoken accelerate + # Install LightEval CLI in an isolated environment + uv tool install "lighteval[multilingual]" + %environment # Ensure uv is present inside the container runtime as well export PATH=/root/.local/bin:$PATH diff --git a/apptainer/leonardo.def b/apptainer/leonardo.def index b4ed789..27f0eca 100644 --- a/apptainer/leonardo.def +++ b/apptainer/leonardo.def @@ -17,6 +17,9 @@ From: nvcr.io/nvidia/pytorch:25.06-py3 uv pip install --system --break-system-packages lm-eval \ "transformers<=4.53.0" "datasets<4.0.0" wandb sentencepiece tiktoken accelerate + # Install LightEval CLI in an isolated environment + uv tool install "lighteval[multilingual]" + %environment # Ensure uv is present inside the container runtime as well export PATH=/root/.local/bin:$PATH diff --git a/apptainer/lumi.def b/apptainer/lumi.def index 4724845..020a0e8 100644 --- a/apptainer/lumi.def +++ b/apptainer/lumi.def @@ -17,6 +17,9 @@ From: rocm/pytorch:rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.7.1 uv pip install --system --break-system-packages lm-eval \ "transformers<=4.53.0" "datasets<4.0.0" wandb sentencepiece tiktoken accelerate + # Install LightEval CLI in an isolated environment + uv tool install "lighteval[multilingual]" + %environment # Ensure uv is present inside the container runtime as well export PATH=/root/.local/bin:$PATH diff --git a/oellm/interactive_csv_builder.py b/oellm/interactive_csv_builder.py index 7d81649..9b918a2 100644 --- a/oellm/interactive_csv_builder.py +++ b/oellm/interactive_csv_builder.py @@ -115,7 +115,7 @@ def signal_handler(sig, frame): # Step 2: Configure tasks console.print("\n[bold cyan]πŸ“ Step 2: Configure Tasks[/bold cyan]") - task_configs = [] + task_configs: list[tuple[str, list[int], str]] = [] add_more = True # Load task groups from YAML file @@ -192,9 +192,10 @@ def signal_handler(sig, frame): for task_item in group_data.get("tasks", []): task_name = task_item["task"] n_shots = 
task_item.get("n_shots", [0]) - task_configs.append((task_name, n_shots)) + suite = task_item.get("suite", "lm_eval") + task_configs.append((task_name, n_shots, suite)) console.print( - f" [green]βœ“ Added: {task_name} with n_shot={n_shots}[/green]" + f" [green]βœ“ Added: {task_name} (suite={suite}) with n_shot={n_shots}[/green]" ) # After adding task groups, ask if user wants to add more or proceed @@ -259,17 +260,53 @@ def signal_handler(sig, frame): try: n_shots = [int(x.strip()) for x in n_shots_str.split(",")] - task_configs.append((task, n_shots)) + suite_choice = questionary.select( + f"Select evaluation suite for '{task}':", + choices=[ + questionary.Choice( + "lm_eval (lm-eval-harness)", value="lm_eval" + ), + questionary.Choice( + "lighteval (Hugging Face LightEval)", + value="lighteval", + ), + "πŸ“ Custom suite", + ], + style=custom_style, + ).ask() + + if suite_choice is None: + console.print("\n[yellow]Cancelled by user.[/yellow]") + return + + if suite_choice == "πŸ“ Custom suite": + suite = questionary.text( + "Enter suite identifier:", + instruction="(e.g., custom-eval-suite)", + style=custom_style, + ).ask() + if suite is None: + console.print("\n[yellow]Cancelled by user.[/yellow]") + return + suite = suite.strip() + if not suite: + suite = "lm_eval" + else: + suite = suite_choice + + task_configs.append((task, n_shots, suite)) console.print( - f"[green]βœ“ Added: {task} with n_shot={n_shots}[/green]" + f"[green]βœ“ Added: {task} (suite={suite}) with n_shot={n_shots}[/green]" ) except ValueError: console.print("[red]Invalid n_shot values. Skipping.[/red]") elif action == "πŸ“‹ View current tasks": console.print("\n[bold]Current tasks:[/bold]") - for i, (task, n_shots) in enumerate(task_configs, 1): - console.print(f" {i}. [green]{task}[/green] β†’ n_shot={n_shots}") + for i, (task, n_shots, suite) in enumerate(task_configs, 1): + console.print( + f" {i}. 
[green]{task}[/green] β†’ n_shot={n_shots} (suite={suite})" + ) console.print() elif action == "βœ… Continue to preview": @@ -285,10 +322,15 @@ def signal_handler(sig, frame): rows = [] for model in models: - for task_name, n_shots in task_configs: + for task_name, n_shots, suite in task_configs: for n_shot in n_shots: rows.append( - {"model_path": model, "task_path": task_name, "n_shot": n_shot} + { + "model_path": model, + "task_path": task_name, + "n_shot": n_shot, + "eval_suite": suite, + } ) df = pd.DataFrame(rows) @@ -302,11 +344,16 @@ def signal_handler(sig, frame): table.add_column("Model", style="cyan", no_wrap=True) table.add_column("Task", style="green") table.add_column("n_shot", justify="right", style="yellow") + table.add_column("Suite", style="magenta") # Show first 10 rows for idx, (_, row) in enumerate(df.head(10).iterrows(), 1): table.add_row( - str(idx), str(row["model_path"]), str(row["task_path"]), str(row["n_shot"]) + str(idx), + str(row["model_path"]), + str(row["task_path"]), + str(row["n_shot"]), + str(row["eval_suite"]), ) if len(df) > 10: diff --git a/oellm/light_eval_benchmarks/flores-200-eu.txt b/oellm/light_eval_benchmarks/flores-200-eu.txt new file mode 100644 index 0000000..414ad1d --- /dev/null +++ b/oellm/light_eval_benchmarks/flores-200-eu.txt @@ -0,0 +1,44 @@ +flores200:bul_Cyrl-eng_Latn|0 +flores200:ces_Latn-eng_Latn|0 +flores200:dan_Latn-eng_Latn|0 +flores200:deu_Latn-eng_Latn|0 +flores200:ell_Grek-eng_Latn|0 +flores200:eng_Latn-bul_Cyrl|0 +flores200:eng_Latn-ces_Latn|0 +flores200:eng_Latn-dan_Latn|0 +flores200:eng_Latn-deu_Latn|0 +flores200:eng_Latn-ell_Grek|0 +flores200:eng_Latn-est_Latn|0 +flores200:eng_Latn-fin_Latn|0 +flores200:eng_Latn-fra_Latn|0 +flores200:eng_Latn-hrv_Latn|0 +flores200:eng_Latn-hun_Latn|0 +flores200:eng_Latn-ita_Latn|0 +flores200:eng_Latn-lit_Latn|0 +flores200:eng_Latn-lvs_Latn|0 +flores200:eng_Latn-mlt_Latn|0 +flores200:eng_Latn-nld_Latn|0 +flores200:eng_Latn-pol_Latn|0 +flores200:eng_Latn-por_Latn|0 +flores200:eng_Latn-ron_Latn|0 +flores200:eng_Latn-slk_Latn|0 +flores200:eng_Latn-slv_Latn|0 +flores200:eng_Latn-spa_Latn|0 +flores200:eng_Latn-swe_Latn|0 +flores200:est_Latn-eng_Latn|0 +flores200:fin_Latn-eng_Latn|0 +flores200:fra_Latn-eng_Latn|0 +flores200:hrv_Latn-eng_Latn|0 +flores200:hun_Latn-eng_Latn|0 +flores200:ita_Latn-eng_Latn|0 +flores200:lit_Latn-eng_Latn|0 +flores200:lvs_Latn-eng_Latn|0 +flores200:mlt_Latn-eng_Latn|0 +flores200:nld_Latn-eng_Latn|0 +flores200:pol_Latn-eng_Latn|0 +flores200:por_Latn-eng_Latn|0 +flores200:ron_Latn-eng_Latn|0 +flores200:slk_Latn-eng_Latn|0 +flores200:slv_Latn-eng_Latn|0 +flores200:spa_Latn-eng_Latn|0 +flores200:swe_Latn-eng_Latn|0 diff --git a/oellm/main.py b/oellm/main.py index 77d415e..94c82ad 100644 --- a/oellm/main.py +++ b/oellm/main.py @@ -1,6 +1,7 @@ import logging import os import re +import shutil import socket import subprocess from datetime import datetime @@ -443,6 +444,11 @@ def schedule_evals( f"CSV file must contain the columns: {', '.join(required_cols)}" ) + if "eval_suite" not in df.columns: + df["eval_suite"] = "lm_eval" + else: + df["eval_suite"] = df["eval_suite"].fillna("lm_eval") + # Always expand local model paths, even with skip_checks df["model_path"].unique() expanded_rows = [] @@ -460,6 +466,9 @@ def schedule_evals( expanded_rows.append(row) df = pd.DataFrame(expanded_rows) + if "eval_suite" not in df.columns: + df["eval_suite"] = "lm_eval" + # Download HF models only if skip_checks is False if not skip_checks: # Process any HF models that need downloading @@ 
-514,6 +523,7 @@ def schedule_evals( ), columns=["model_path", "task_path", "n_shot"], ) + df["eval_suite"] = "lm_eval" else: raise ValueError( "Either `eval_csv_path` must be provided, or all of `models`, `tasks`, and `n_shot`." @@ -526,9 +536,13 @@ def schedule_evals( # Ensure that all datasets required by the tasks are cached locally to avoid # network access on compute nodes. if not skip_checks: - _pre_download_task_datasets( - df["task_path"].unique(), trust_remote_code=trust_remote_code - ) + lm_eval_tasks = df[ + df["eval_suite"].str.lower().isin({"lm_eval", "lm-eval", "lm-eval-harness"}) + ]["task_path"].unique() + if len(lm_eval_tasks) > 0: + _pre_download_task_datasets( + lm_eval_tasks, trust_remote_code=trust_remote_code + ) else: logging.info("Skipping dataset pre-download (--skip-checks enabled)") @@ -583,7 +597,12 @@ def schedule_evals( total_minutes = 0 task_time_cache = {} # Cache to avoid recalculating for same tasks - for _, row in df.iterrows(): + lm_eval_mask = df["eval_suite"].str.lower().isin( + {"lm_eval", "lm-eval", "lm-eval-harness"} + ) + light_eval_mask = df["eval_suite"].str.lower().isin({"lighteval", "light-eval"}) + + for _, row in df[lm_eval_mask].iterrows(): task_name = row["task_path"] if task_name not in task_time_cache: task_time_cache[task_name] = _calculate_task_minutes( @@ -591,12 +610,27 @@ def schedule_evals( ) total_minutes += task_time_cache[task_name] + if light_eval_mask.any(): + # LightEval benchmarks can be large; budget 15 minutes per evaluation + light_eval_minutes = int(light_eval_mask.sum() * 15) + total_minutes += light_eval_minutes + logging.info( + "Estimated LightEval time budget: %s minutes across %s evaluations", + light_eval_minutes, + light_eval_mask.sum(), + ) + # Calculate average minutes per eval for logging purposes minutes_per_eval = total_minutes / total_evals if total_evals > 0 else 10 logging.info("πŸ“Š Dynamic time calculation:") for task_name, task_minutes in task_time_cache.items(): - task_count = (df["task_path"] == task_name).sum() + task_count = ( + (df["task_path"] == task_name) + & df["eval_suite"].str.lower().isin( + {"lm_eval", "lm-eval", "lm-eval-harness"} + ) + ).sum() logging.info( f" Task '{task_name}': {task_minutes} min/eval Γ— {task_count} evals = {task_minutes * task_count} total minutes" ) @@ -608,6 +642,24 @@ def schedule_evals( "⚠️ Using fixed 10 min/eval (task complexity detection skipped with --skip-checks)" ) + # Copy LightEval benchmark files into evaluation directory if necessary + light_eval_paths = df[ + df["eval_suite"].str.lower().isin({"lighteval", "light-eval"}) + ]["task_path"].unique() + benchmark_dir = evals_dir / "light_eval_tasks" + copied_paths: dict[str, str] = {} + if light_eval_paths.size > 0: + benchmark_dir.mkdir(parents=True, exist_ok=True) + for task_path in light_eval_paths: + candidate = Path(task_path) + if candidate.exists() and candidate.is_file(): + destination = benchmark_dir / candidate.name + shutil.copy(candidate, destination) + copied_paths[str(candidate)] = str(destination) + + if copied_paths: + df.replace({"task_path": copied_paths}, inplace=True) + # Maximum runtime per job (18 hours with safety margin) max_minutes_per_job = 18 * 60 # 18 hours min_array_size_for_time = max(1, int(np.ceil(total_minutes / max_minutes_per_job))) diff --git a/oellm/task-groups.yaml b/oellm/task-groups.yaml index 2baabea..177cb61 100644 --- a/oellm/task-groups.yaml +++ b/oellm/task-groups.yaml @@ -79,3 +79,67 @@ task_groups: n_shots: [5] - task: belebele_swe_Latn n_shots: [5] + 
oellm-multilingual: + description: "Combined Belebele EU set plus multilingual benchmarks" + tasks: + - task: belebele_bul_Cyrl + n_shots: [5] + - task: belebele_hrv_Latn + n_shots: [5] + - task: belebele_ces_Latn + n_shots: [5] + - task: belebele_dan_Latn + n_shots: [5] + - task: belebele_nld_Latn + n_shots: [5] + - task: belebele_eng_Latn + n_shots: [5] + - task: belebele_est_Latn + n_shots: [5] + - task: belebele_fin_Latn + n_shots: [5] + - task: belebele_fra_Latn + n_shots: [5] + - task: belebele_deu_Latn + n_shots: [5] + - task: belebele_ell_Grek + n_shots: [5] + - task: belebele_hun_Latn + n_shots: [5] + - task: belebele_ita_Latn + n_shots: [5] + - task: belebele_lvs_Latn + n_shots: [5] + - task: belebele_lit_Latn + n_shots: [5] + - task: belebele_mlt_Latn + n_shots: [5] + - task: belebele_pol_Latn + n_shots: [5] + - task: belebele_por_Latn + n_shots: [5] + - task: belebele_ron_Latn + n_shots: [5] + - task: belebele_slk_Latn + n_shots: [5] + - task: belebele_slv_Latn + n_shots: [5] + - task: belebele_spa_Latn + n_shots: [5] + - task: belebele_swe_Latn + n_shots: [5] + - task: xwinograd + n_shots: [0] + - task: xcopa + n_shots: [0] + - task: xstorycloze + n_shots: [0] + - task: global_mmlu + n_shots: [0] + suite: lm_eval + - task: light_eval_benchmarks/flores-200-eu.txt + n_shots: [0] + suite: lighteval + - task: include + n_shots: [0] + suite: lm_eval diff --git a/oellm/template.sbatch b/oellm/template.sbatch index 34c95c3..a4f9317 100644 --- a/oellm/template.sbatch +++ b/oellm/template.sbatch @@ -56,12 +56,13 @@ fi # Use `tail` and `head` to slice the CSV file for the tasks assigned to this job. # The +1 on START_INDEX accounts for the header row. tail -n +$((START_INDEX + 1)) "$CSV_PATH" | head -n $((END_INDEX - START_INDEX + 1)) | \ -while IFS=, read -r model_path task_path n_shot +while IFS=, read -r model_path task_path n_shot eval_suite do # Remove trailing carriage returns if script is edited on Windows model_path=$(echo "$model_path" | tr -d '\r') task_path=$(echo "$task_path" | tr -d '\r') n_shot=$(echo "$n_shot" | tr -d '\r') + eval_suite=$(echo "${eval_suite:-lm_eval}" | tr -d '\r') # Skip empty lines if [ -z "$model_path" ]; then @@ -73,6 +74,7 @@ do echo " Model: $model_path" echo " Task: $task_path" echo " N-shot: $n_shot" + echo " Suite: $eval_suite" echo "----------------------------------------------------" # Build bind paths: always mount the shared eval directory, and additionally @@ -91,16 +93,54 @@ do fi fi - - singularity exec $SINGULARITY_ARGS \ - --bind $BIND_PATHS \ - $EVAL_SIF_PATH \ - python -m lm_eval --model hf \ - --model_args pretrained="$model_path",trust_remote_code=True \ - --tasks "$task_path" \ - --num_fewshot "$n_shot" \ - --output_path "{evals_dir}/$(openssl rand -hex 5).json" \ - --trust_remote_code + suite_normalized=$(echo "$eval_suite" | tr '[:upper:]' '[:lower:]') + + case "$suite_normalized" in + lm_eval|lm-eval|lm-eval-harness) + singularity exec $SINGULARITY_ARGS \ + --bind $BIND_PATHS \ + $EVAL_SIF_PATH \ + python -m lm_eval --model hf \ + --model_args pretrained="$model_path",trust_remote_code=True \ + --tasks "$task_path" \ + --num_fewshot "$n_shot" \ + --output_path "{evals_dir}/$(openssl rand -hex 5).json" \ + --trust_remote_code + ;; + lighteval|light-eval) + LIGHT_TASK="$task_path" + + if [[ -f "$LIGHT_TASK" ]]; then + LIGHT_TASK_ARG="$LIGHT_TASK" + else + last_segment="${LIGHT_TASK##*|}" + if [[ "$LIGHT_TASK" == *"|"* && "$last_segment" =~ ^[0-9]+$ ]]; then + if [[ -n "$n_shot" && "$last_segment" != "$n_shot" ]]; then + 
LIGHT_TASK_ARG="${LIGHT_TASK%|*}|$n_shot" + else + LIGHT_TASK_ARG="$LIGHT_TASK" + fi + else + LIGHT_TASK_ARG="${LIGHT_TASK}|$n_shot" + fi + fi + + RESULTS_SUBDIR="{evals_dir}/$(openssl rand -hex 5)" + mkdir -p "$RESULTS_SUBDIR" + + singularity exec $SINGULARITY_ARGS \ + --bind $BIND_PATHS \ + $EVAL_SIF_PATH \ + lighteval accelerate \ + "model_name=$model_path,trust_remote_code=True" \ + "$LIGHT_TASK_ARG" \ + --output_dir "$RESULTS_SUBDIR" \ + --save_details + ;; + *) + echo "[warning] Unknown evaluation suite '$eval_suite'. Skipping." + ;; + esac echo "Evaluation finished for model: $model_path" From a8104fc637440c3eee3cfb0d99cd7b16ae2bb60c Mon Sep 17 00:00:00 2001 From: "Timur M. Carstensen" <40788422+timurcarstensen@users.noreply.github.com> Date: Tue, 14 Oct 2025 18:33:10 +0200 Subject: [PATCH 02/39] Remove mypy pre-commit hook --- .pre-commit-config.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3de1803..94ccd57 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,9 +19,3 @@ repos: args: [--fix, --exit-non-zero-on-fix] - id: ruff-format - - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.8.0 - hooks: - - id: mypy - additional_dependencies: [types-all] - args: [--ignore-missing-imports] \ No newline at end of file From 3e9a6b62d2521e7b12e704deb3ebc7298e0bd043 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Sun, 19 Oct 2025 22:24:27 +0300 Subject: [PATCH 03/39] chore: remove tests --- tests/__init__.py | 0 tests/test_expand_local_model_paths.py | 189 -------- tests/test_interactive_csv_builder.py | 597 ------------------------- 3 files changed, 786 deletions(-) delete mode 100644 tests/__init__.py delete mode 100644 tests/test_expand_local_model_paths.py delete mode 100644 tests/test_interactive_csv_builder.py diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/test_expand_local_model_paths.py b/tests/test_expand_local_model_paths.py deleted file mode 100644 index a913c47..0000000 --- a/tests/test_expand_local_model_paths.py +++ /dev/null @@ -1,189 +0,0 @@ -import tempfile -from pathlib import Path - -import pytest - -from oellm.main import _expand_local_model_paths - - -class TestExpandLocalModelPaths: - """Test suite for the _expand_local_model_paths function.""" - - @pytest.fixture - def temp_dir(self): - """Create a temporary directory for testing.""" - with tempfile.TemporaryDirectory() as tmpdir: - yield Path(tmpdir) - - def create_safetensors_file(self, path: Path, name: str = "model.safetensors"): - """Helper to create a dummy safetensors file.""" - path.mkdir(parents=True, exist_ok=True) - (path / name).touch() - - def test_single_model_directory(self, temp_dir): - """Test a directory containing safetensors files directly.""" - model_dir = temp_dir / "model" - self.create_safetensors_file(model_dir) - - result = _expand_local_model_paths(str(model_dir)) - - assert len(result) == 1 - assert result[0] == model_dir - - def test_model_with_hf_checkpoints(self, temp_dir): - """Test a model with hf/iter_* checkpoint structure.""" - model_dir = temp_dir / "model" - - # Create checkpoint structure - checkpoint1 = model_dir / "hf" / "iter_0001000" - checkpoint2 = model_dir / "hf" / "iter_0002000" - checkpoint3 = model_dir / "hf" / "iter_0003000" - - self.create_safetensors_file(checkpoint1) - self.create_safetensors_file(checkpoint2) - self.create_safetensors_file(checkpoint3) - - result = 
_expand_local_model_paths(str(model_dir)) - - assert len(result) == 3 - assert checkpoint1 in result - assert checkpoint2 in result - assert checkpoint3 in result - - def test_directory_with_iteration_subdirs(self, temp_dir): - """Test a directory directly containing iter_* subdirectories.""" - model_dir = temp_dir / "model_a" - - # Create iteration directories directly under model_a - iter1 = model_dir / "iter_0001000" - iter2 = model_dir / "iter_0002000" - iter3 = model_dir / "iter_0003000" - - self.create_safetensors_file(iter1) - self.create_safetensors_file(iter2) - self.create_safetensors_file(iter3) - - result = _expand_local_model_paths(str(model_dir)) - - assert len(result) == 3 - assert iter1 in result - assert iter2 in result - assert iter3 in result - - def test_directory_with_multiple_models(self, temp_dir): - """Test a directory containing multiple model subdirectories.""" - parent_dir = temp_dir / "converted_checkpoints" - - # Create multiple models - model1 = parent_dir / "open-sci-ref_model-0.13b_data-c4" - model2 = parent_dir / "open-sci-ref_model-0.35b_data-c4" - - self.create_safetensors_file(model1) - self.create_safetensors_file(model2) - - result = _expand_local_model_paths(str(parent_dir)) - - assert len(result) == 2 - assert model1 in result - assert model2 in result - - def test_multiple_models_with_checkpoints(self, temp_dir): - """Test multiple models each with their own checkpoints.""" - parent_dir = temp_dir / "models" - - # Model 1 with checkpoints - model1_checkpoint1 = parent_dir / "model1" / "hf" / "iter_1000" - model1_checkpoint2 = parent_dir / "model1" / "hf" / "iter_2000" - - # Model 2 with checkpoints - model2_checkpoint1 = parent_dir / "model2" / "hf" / "iter_1000" - model2_checkpoint2 = parent_dir / "model2" / "hf" / "iter_2000" - - self.create_safetensors_file(model1_checkpoint1) - self.create_safetensors_file(model1_checkpoint2) - self.create_safetensors_file(model2_checkpoint1) - self.create_safetensors_file(model2_checkpoint2) - - result = _expand_local_model_paths(str(parent_dir)) - - assert len(result) == 4 - assert model1_checkpoint1 in result - assert model1_checkpoint2 in result - assert model2_checkpoint1 in result - assert model2_checkpoint2 in result - - def test_empty_directory(self, temp_dir): - """Test an empty directory returns no models.""" - empty_dir = temp_dir / "empty" - empty_dir.mkdir() - - result = _expand_local_model_paths(str(empty_dir)) - - assert len(result) == 0 - - def test_non_existent_directory(self, temp_dir): - """Test a non-existent directory returns no models.""" - non_existent = temp_dir / "does_not_exist" - - result = _expand_local_model_paths(str(non_existent)) - - assert len(result) == 0 - - def test_directory_with_non_model_files(self, temp_dir): - """Test a directory with files but no safetensors.""" - dir_with_files = temp_dir / "not_a_model" - dir_with_files.mkdir() - (dir_with_files / "readme.txt").touch() - (dir_with_files / "config.json").touch() - - result = _expand_local_model_paths(str(dir_with_files)) - - assert len(result) == 0 - - def test_mixed_structure(self, temp_dir): - """Test a directory with mixed structure (some models, some checkpoints).""" - parent_dir = temp_dir / "mixed" - - # Direct model - direct_model = parent_dir / "direct_model" - self.create_safetensors_file(direct_model) - - # Model with checkpoints - checkpoint_model = parent_dir / "checkpoint_model" / "hf" / "iter_1000" - self.create_safetensors_file(checkpoint_model) - - # Empty directory - (parent_dir / 
"empty_dir").mkdir(parents=True) - - # Non-model files - (parent_dir / "readme.txt").touch() - - result = _expand_local_model_paths(str(parent_dir)) - - assert len(result) == 2 - assert direct_model in result - assert checkpoint_model in result - - def test_file_instead_of_directory(self, temp_dir): - """Test passing a file instead of a directory.""" - file_path = temp_dir / "file.txt" - file_path.touch() - - result = _expand_local_model_paths(str(file_path)) - - assert len(result) == 0 - - def test_symlinked_directory(self, temp_dir: Path): - """Test handling of symlinked directories.""" - # Create actual model directory - actual_model = temp_dir / "actual_model" - self.create_safetensors_file(actual_model) - - # Create symlink to model - symlink = temp_dir / "symlinked_model" - symlink.symlink_to(actual_model) - - result = _expand_local_model_paths(str(symlink)) - - assert len(result) == 1 - assert result[0] == symlink # Should return the symlink path, not the target diff --git a/tests/test_interactive_csv_builder.py b/tests/test_interactive_csv_builder.py deleted file mode 100644 index e070ea6..0000000 --- a/tests/test_interactive_csv_builder.py +++ /dev/null @@ -1,597 +0,0 @@ -import tempfile -from pathlib import Path -from unittest.mock import mock_open, patch - -import pandas as pd -import pytest -import yaml - -from oellm.interactive_csv_builder import build_csv_interactive - - -class TestInteractiveCSVBuilder: - """Test suite for the interactive CSV builder.""" - - @pytest.fixture - def temp_output_path(self): - """Create a temporary output path for testing.""" - with tempfile.NamedTemporaryFile(suffix=".csv", delete=True) as f: - temp_path = f.name - yield temp_path - # Cleanup - Path(temp_path).unlink(missing_ok=True) - - @patch("oellm.interactive_csv_builder.questionary.select") - @patch("oellm.interactive_csv_builder.questionary.text") - @patch("oellm.interactive_csv_builder.questionary.confirm") - def test_basic_csv_creation( - self, mock_confirm, mock_text, mock_select, temp_output_path - ): - """Test basic CSV creation with one model and one task.""" - # Mock user interactions - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", # Choose to add a model - "βœ… Continue to tasks", # Continue to tasks - "βž• Add a single task", # Add a task - "0 (zero-shot)", # Choose n_shot value - "βœ… Continue to preview", # Continue to preview - ] - - mock_text.return_value.ask.side_effect = [ - "test-model", # Enter model name - "test-task", # Enter task name - ] - - mock_confirm.return_value.ask.return_value = True # Confirm save - - # Run the builder - build_csv_interactive(temp_output_path) - - # Verify CSV was created - assert Path(temp_output_path).exists() - - # Load and verify content - df = pd.read_csv(temp_output_path) - assert len(df) == 1 - assert df.iloc[0]["model_path"] == "test-model" - assert df.iloc[0]["task_path"] == "test-task" - assert df.iloc[0]["n_shot"] == 0 - - @patch("oellm.interactive_csv_builder.questionary.select") - @patch("oellm.interactive_csv_builder.questionary.text") - @patch("oellm.interactive_csv_builder.questionary.confirm") - def test_multiple_models_and_tasks( - self, mock_confirm, mock_text, mock_select, temp_output_path - ): - """Test CSV creation with multiple models and tasks.""" - # Mock user interactions - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", - "βž• Add a model", - "βœ… Continue to tasks", - "βž• Add a single task", - "0,5 (both)", # Multiple n_shot values - "βž• Add a single task", - "5 (few-shot)", - 
"βœ… Continue to preview", - ] - - mock_text.return_value.ask.side_effect = [ - "model1", - "meta-llama/Llama-2-7b-hf", - "task1", - "task2", - ] - - mock_confirm.return_value.ask.return_value = True - - # Run the builder - build_csv_interactive(temp_output_path) - - # Load and verify content - df = pd.read_csv(temp_output_path) - assert len(df) == 6 # 2 models Γ— (2 n_shots for task1 + 1 n_shot for task2) - - # Check all combinations exist - assert set(df["model_path"].unique()) == {"model1", "meta-llama/Llama-2-7b-hf"} - assert set(df["task_path"].unique()) == {"task1", "task2"} - - # Check n_shot values for task1 - task1_df = df[df["task_path"] == "task1"] - assert set(task1_df["n_shot"].unique()) == {0, 5} - - # Check n_shot values for task2 - task2_df = df[df["task_path"] == "task2"] - assert set(task2_df["n_shot"].unique()) == {5} - - @patch("oellm.interactive_csv_builder.questionary.select") - @patch("oellm.interactive_csv_builder.questionary.text") - @patch("oellm.interactive_csv_builder.questionary.confirm") - def test_custom_n_shot_values( - self, mock_confirm, mock_text, mock_select, temp_output_path - ): - """Test custom n_shot value input.""" - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", - "βœ… Continue to tasks", - "βž• Add a single task", - "πŸ“ Custom values", # Choose custom n_shot - "βœ… Continue to preview", - ] - - mock_text.return_value.ask.side_effect = [ - "test-model", - "test-task", - "0,3,7,15", # Custom n_shot values - ] - - mock_confirm.return_value.ask.return_value = True - - build_csv_interactive(temp_output_path) - - df = pd.read_csv(temp_output_path) - assert len(df) == 4 - assert set(df["n_shot"].unique()) == {0, 3, 7, 15} - - @patch("oellm.interactive_csv_builder.questionary.select") - @patch("oellm.interactive_csv_builder.questionary.text") - @patch("oellm.interactive_csv_builder.questionary.confirm") - def test_local_path_model( - self, mock_confirm, mock_text, mock_select, temp_output_path - ): - """Test adding a model via local path.""" - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", - "βœ… Continue to tasks", - "βž• Add a single task", - "0 (zero-shot)", - "βœ… Continue to preview", - ] - - mock_text.return_value.ask.side_effect = [ - "/path/to/local/model", # Enter local path as model - "test-task", # Enter task name - ] - mock_confirm.return_value.ask.return_value = True - - build_csv_interactive(temp_output_path) - - df = pd.read_csv(temp_output_path) - assert df.iloc[0]["model_path"] == "/path/to/local/model" - - @patch("oellm.interactive_csv_builder.questionary.select") - @patch("oellm.interactive_csv_builder.questionary.text") - @patch("oellm.interactive_csv_builder.questionary.confirm") - def test_user_cancellation( - self, mock_confirm, mock_text, mock_select, temp_output_path - ): - """Test user cancellation at various points.""" - # Test cancellation during model input - mock_select.return_value.ask.return_value = None # Simulate Ctrl+C - - build_csv_interactive(temp_output_path) - - # CSV should not be created - assert not Path(temp_output_path).exists() - - @patch("oellm.interactive_csv_builder.questionary.select") - @patch("oellm.interactive_csv_builder.questionary.text") - @patch("oellm.interactive_csv_builder.questionary.confirm") - def test_no_save_confirmation( - self, mock_confirm, mock_text, mock_select, temp_output_path - ): - """Test when user chooses not to save.""" - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", - "βœ… Continue to tasks", - "βž• Add a single task", - 
"0 (zero-shot)", - "βœ… Continue to preview", - ] - - mock_text.return_value.ask.side_effect = [ - "test-model", - "test-task", - ] - - mock_confirm.return_value.ask.return_value = False # Don't save - - build_csv_interactive(temp_output_path) - - # CSV should not be created - assert not Path(temp_output_path).exists() - - @patch("oellm.interactive_csv_builder.questionary.select") - @patch("oellm.interactive_csv_builder.questionary.text") - @patch("oellm.interactive_csv_builder.questionary.confirm") - def test_invalid_n_shot_values( - self, mock_confirm, mock_text, mock_select, temp_output_path - ): - """Test handling of invalid n_shot values.""" - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", - "βœ… Continue to tasks", - "βž• Add a single task", - "πŸ“ Custom values", - "βž• Add a single task", # Add another task after invalid input - "0 (zero-shot)", - "βœ… Continue to preview", - ] - - mock_text.return_value.ask.side_effect = [ - "test-model", - "test-task1", - "invalid,values", # Invalid n_shot values - "test-task2", - ] - - mock_confirm.return_value.ask.return_value = True - - build_csv_interactive(temp_output_path) - - df = pd.read_csv(temp_output_path) - # Only the second task should be in the CSV - assert len(df) == 1 - assert df.iloc[0]["task_path"] == "test-task2" - - @patch("oellm.interactive_csv_builder.questionary.select") - @patch("oellm.interactive_csv_builder.questionary.text") - @patch("oellm.interactive_csv_builder.questionary.confirm") - def test_view_current_models_and_tasks( - self, mock_confirm, mock_text, mock_select, temp_output_path - ): - """Test viewing current models and tasks functionality.""" - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", - "πŸ“‹ View current models", # View models - "βœ… Continue to tasks", - "βž• Add a single task", - "0 (zero-shot)", - "πŸ“‹ View current tasks", # View tasks - "βœ… Continue to preview", - ] - - mock_text.return_value.ask.side_effect = [ - "test-model", - "test-task", - ] - - mock_confirm.return_value.ask.return_value = True - - # This should run without errors - build_csv_interactive(temp_output_path) - - df = pd.read_csv(temp_output_path) - assert len(df) == 1 - - def test_output_directory_creation(self): - """Test that output directory is created if it doesn't exist.""" - with tempfile.TemporaryDirectory() as tmpdir: - nested_path = Path(tmpdir) / "nested" / "dir" / "output.csv" - - with patch( - "oellm.interactive_csv_builder.questionary.select" - ) as mock_select, patch( - "oellm.interactive_csv_builder.questionary.text" - ) as mock_text, patch( - "oellm.interactive_csv_builder.questionary.confirm" - ) as mock_confirm: - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", - "βœ… Continue to tasks", - "βž• Add a single task", - "0 (zero-shot)", - "βœ… Continue to preview", - ] - - mock_text.return_value.ask.side_effect = [ - "test-model", - "test-task", - ] - - mock_confirm.return_value.ask.return_value = True - - build_csv_interactive(str(nested_path)) - - # Check that directory was created - assert nested_path.parent.exists() - assert nested_path.exists() - - @patch("oellm.interactive_csv_builder.questionary.select") - @patch("oellm.interactive_csv_builder.questionary.checkbox") - @patch("oellm.interactive_csv_builder.questionary.confirm") - @patch("pathlib.Path.exists") - @patch("builtins.open", new_callable=mock_open) - def test_single_task_group_selection( - self, - mock_file, - mock_exists, - mock_confirm, - mock_checkbox, - mock_select, - temp_output_path, - 
): - """Test selecting a single task group.""" - # Mock YAML content - yaml_content = { - "task_groups": { - "open-sci-default": { - "description": "Default OpenEuroLLM scientific tasks", - "tasks": [ - {"task": "copa", "n_shots": [0]}, - {"task": "openbookqa", "n_shots": [0]}, - {"task": "mmlu", "n_shots": [5]}, - ], - } - } - } - mock_file.return_value.read.return_value = yaml.dump(yaml_content) - mock_exists.return_value = True - - # Mock user interactions - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", - "βœ… Continue to tasks", - "πŸ“¦ Use a default task group", - "βœ… Continue to preview", # After adding task groups (line 201-208) - ] - - mock_checkbox.return_value.ask.return_value = [ - "open-sci-default - Default OpenEuroLLM scientific tasks" - ] - - # Mock text input for model - with patch("oellm.interactive_csv_builder.questionary.text") as mock_text: - mock_text.return_value.ask.return_value = "test-model" - mock_confirm.return_value.ask.return_value = True - - build_csv_interactive(temp_output_path) - - # Verify CSV was created with correct content - df = pd.read_csv(temp_output_path) - assert len(df) == 3 # 3 tasks from the group - assert set(df["task_path"]) == {"copa", "openbookqa", "mmlu"} - assert df[df["task_path"] == "copa"]["n_shot"].values[0] == 0 - assert df[df["task_path"] == "mmlu"]["n_shot"].values[0] == 5 - - @patch("oellm.interactive_csv_builder.questionary.select") - @patch("oellm.interactive_csv_builder.questionary.checkbox") - @patch("oellm.interactive_csv_builder.questionary.confirm") - @patch("pathlib.Path.exists") - @patch("builtins.open", new_callable=mock_open) - def test_multiple_task_groups_selection( - self, - mock_file, - mock_exists, - mock_confirm, - mock_checkbox, - mock_select, - temp_output_path, - ): - """Test selecting multiple task groups.""" - # Mock YAML content with multiple groups - yaml_content = { - "task_groups": { - "group1": { - "description": "First group", - "tasks": [ - {"task": "task1", "n_shots": [0]}, - {"task": "task2", "n_shots": [5]}, - ], - }, - "group2": { - "description": "Second group", - "tasks": [ - {"task": "task3", "n_shots": [0, 5]}, - {"task": "task4", "n_shots": [10]}, - ], - }, - } - } - mock_file.return_value.read.return_value = yaml.dump(yaml_content) - mock_exists.return_value = True - - # Mock user interactions - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", - "βœ… Continue to tasks", - "πŸ“¦ Use a default task group", - "βœ… Continue to preview", # After adding task groups (line 201-208) - ] - - mock_checkbox.return_value.ask.return_value = [ - "group1 - First group", - "group2 - Second group", - ] - - # Mock text input for model - with patch("oellm.interactive_csv_builder.questionary.text") as mock_text: - mock_text.return_value.ask.return_value = "test-model" - mock_confirm.return_value.ask.return_value = True - - build_csv_interactive(temp_output_path) - - # Verify CSV was created with correct content - df = pd.read_csv(temp_output_path) - assert len(df) == 5 # 2 + 3 (task3 has 2 n_shots) - assert set(df["task_path"]) == {"task1", "task2", "task3", "task4"} - - # Check n_shot values - assert df[df["task_path"] == "task1"]["n_shot"].values[0] == 0 - assert df[df["task_path"] == "task2"]["n_shot"].values[0] == 5 - assert set(df[df["task_path"] == "task3"]["n_shot"].values) == {0, 5} - assert df[df["task_path"] == "task4"]["n_shot"].values[0] == 10 - - @patch("oellm.interactive_csv_builder.questionary.select") - 
@patch("oellm.interactive_csv_builder.questionary.checkbox") - @patch("oellm.interactive_csv_builder.questionary.text") - @patch("oellm.interactive_csv_builder.questionary.confirm") - @patch("pathlib.Path.exists") - @patch("builtins.open", new_callable=mock_open) - def test_task_group_no_selection( - self, - mock_file, - mock_exists, - mock_confirm, - mock_text, - mock_checkbox, - mock_select, - temp_output_path, - ): - """Test when user opens task group menu but doesn't select any.""" - # Mock YAML content - yaml_content = { - "task_groups": { - "group1": { - "description": "Test group", - "tasks": [{"task": "task1", "n_shots": [0]}], - } - } - } - mock_file.return_value.read.return_value = yaml.dump(yaml_content) - mock_exists.return_value = True - - # Mock user interactions - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", - "βœ… Continue to tasks", - "πŸ“¦ Use a default task group", - "βž• Add a single task", # Continue to add single task after no selection - "0 (zero-shot)", - "βœ… Continue to preview", - ] - - mock_checkbox.return_value.ask.return_value = [] # No groups selected - - mock_text.return_value.ask.side_effect = [ - "test-model", - "manual-task", - ] - - mock_confirm.return_value.ask.return_value = True - - build_csv_interactive(temp_output_path) - - # Verify CSV only contains the manually added task - df = pd.read_csv(temp_output_path) - assert len(df) == 1 - assert df["task_path"].values[0] == "manual-task" - assert df["n_shot"].values[0] == 0 - - @patch("oellm.interactive_csv_builder.questionary.select") - @patch("oellm.interactive_csv_builder.questionary.text") - @patch("oellm.interactive_csv_builder.questionary.confirm") - @patch("pathlib.Path.exists") - def test_task_group_yaml_not_found( - self, mock_exists, mock_confirm, mock_text, mock_select, temp_output_path - ): - """Test behavior when task-groups.yaml file doesn't exist.""" - # Mock that the YAML file doesn't exist - mock_exists.return_value = False - - # Mock user interactions - no task group option should appear - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", - "βœ… Continue to tasks", - "βž• Add a single task", # No task group option available - "0 (zero-shot)", - "βœ… Continue to preview", - ] - - mock_text.return_value.ask.side_effect = [ - "test-model", - "test-task", - ] - - mock_confirm.return_value.ask.return_value = True - - build_csv_interactive(temp_output_path) - - # Verify CSV was created with manually added task - df = pd.read_csv(temp_output_path) - assert len(df) == 1 - assert df["model_path"].values[0] == "test-model" - assert df["task_path"].values[0] == "test-task" - assert df["n_shot"].values[0] == 0 - - @patch("oellm.interactive_csv_builder.questionary.select") - @patch("oellm.interactive_csv_builder.questionary.checkbox") - @patch("oellm.interactive_csv_builder.questionary.text") - @patch("oellm.interactive_csv_builder.questionary.confirm") - @patch("pathlib.Path.exists") - @patch("builtins.open", new_callable=mock_open) - def test_task_group_combined_with_individual_tasks( - self, - mock_file, - mock_exists, - mock_confirm, - mock_text, - mock_checkbox, - mock_select, - temp_output_path, - ): - """Test combining task groups with individually added tasks.""" - # Mock YAML content - yaml_content = { - "task_groups": { - "small-group": { - "description": "Small test group", - "tasks": [ - {"task": "group-task1", "n_shots": [0]}, - {"task": "group-task2", "n_shots": [5]}, - ], - } - } - } - mock_file.return_value.read.return_value = 
yaml.dump(yaml_content) - mock_exists.return_value = True - - # Mock user interactions - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", - "βœ… Continue to tasks", - "πŸ“¦ Use a default task group", - "βž• Add more tasks", # Choose to add more after task group - "βž• Add a single task", - "0,5 (both)", - "βž• Add a single task", - "πŸ“ Custom values", - "βœ… Continue to preview", - ] - - mock_checkbox.return_value.ask.return_value = ["small-group - Small test group"] - - mock_text.return_value.ask.side_effect = [ - "test-model", - "individual-task1", - "individual-task2", - "0,10,25", # Custom n_shot values - ] - - mock_confirm.return_value.ask.return_value = True - - build_csv_interactive(temp_output_path) - - # Verify CSV contains both group tasks and individual tasks - df = pd.read_csv(temp_output_path) - - # Should have: 2 group tasks + 2 individual-task1 n_shots + 3 individual-task2 n_shots = 7 - assert len(df) == 7 - - # Check all tasks are present - assert set(df["task_path"]) == { - "group-task1", - "group-task2", - "individual-task1", - "individual-task2", - } - - # Verify n_shot values for each task - assert df[df["task_path"] == "group-task1"]["n_shot"].values[0] == 0 - assert df[df["task_path"] == "group-task2"]["n_shot"].values[0] == 5 - assert set(df[df["task_path"] == "individual-task1"]["n_shot"].values) == {0, 5} - assert set(df[df["task_path"] == "individual-task2"]["n_shot"].values) == { - 0, - 10, - 25, - } From 1a0b16ac4d2e4783ac506477df455ccb66cea06b Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Sun, 19 Oct 2025 22:25:15 +0300 Subject: [PATCH 04/39] fix: lighteval integration --- oellm/main.py | 633 +++++++++++++++++------------------------ oellm/task-groups.yaml | 123 +++++--- oellm/template.sbatch | 11 +- pyproject.toml | 6 +- 4 files changed, 351 insertions(+), 422 deletions(-) diff --git a/oellm/main.py b/oellm/main.py index 94c82ad..72679e8 100644 --- a/oellm/main.py +++ b/oellm/main.py @@ -19,7 +19,7 @@ from rich.logging import RichHandler -def ensure_singularity_image(image_name: str) -> None: +def _ensure_singularity_image(image_name: str) -> None: # TODO: switch to OELLM dataset repo once it is created from huggingface_hub import hf_hub_download @@ -379,9 +379,131 @@ def _pre_download_task_datasets( logging.debug(f"Finished dataset preparation for task '{task_name}'.") +def _pre_download_lighteval_datasets(tasks: Iterable[str]) -> None: + """Pre-download LightEval datasets by instantiating tasks via the local LightEval Registry.""" + import sys + + local_le_src = Path(__file__).parent.parent / "lighteval" / "src" + if local_le_src.exists(): + sys.path.insert(0, str(local_le_src)) + + from lighteval.tasks.registry import Registry, TRUNCATE_FEW_SHOTS_DEFAULTS # type: ignore + from lighteval.tasks.lighteval_task import LightevalTask # type: ignore + + file_task_specs: list[str] = [] + string_task_specs: list[str] = [] + + for t in tasks: + raw = str(t).strip() + if not raw: + continue + candidate = Path(raw) + if candidate.exists() and candidate.is_file(): + file_task_specs.append(str(candidate)) + else: + spec = raw + truncate_default = int(TRUNCATE_FEW_SHOTS_DEFAULTS) + if "|" not in spec: + spec = f"lighteval|{spec}|0|{truncate_default}" + elif spec.count("|") == 1: + spec = f"{spec}|0|{truncate_default}" + elif spec.count("|") == 2: + spec = f"{spec}|{truncate_default}" + string_task_specs.append(spec) + + unique_string_specs = sorted(set(string_task_specs)) + unique_file_specs = sorted(set(file_task_specs)) + + if 
unique_string_specs: + reg = Registry(custom_tasks="lighteval.tasks.multilingual.tasks") + configs = reg.get_tasks_configs(",".join(unique_string_specs)) + task_dict = reg.get_tasks_from_configs(configs) + LightevalTask.load_datasets(task_dict) + + for fp in unique_file_specs: + reg_file = Registry() + configs_file = reg_file.get_tasks_configs(fp) + task_dict_file = reg_file.get_tasks_from_configs(configs_file) + LightevalTask.load_datasets(task_dict_file) + +def _load_task_groups() -> dict[str, dict]: + """Load task groups from `task-groups.yaml` located next to this module.""" + groups_file = Path(__file__).parent / "task-groups.yaml" + if not groups_file.exists(): + raise ValueError(f"Task groups file not found: {groups_file}") + + with open(groups_file) as f: + data = yaml.safe_load(f) or {} + + groups = data.get("task_groups") or {} + if not isinstance(groups, dict): + raise ValueError("Invalid task groups format in task-groups.yaml") + + return groups + + +def _expand_task_groups(group_names: Iterable[str]) -> list[tuple[str, list[int], str]]: + """ + Expand task group names into concrete (task, n_shots, suite) tuples. + + Supports nested groups. Defaults: suite=lm_eval, n_shots=[0] when absent. + A group's `suite` (if present) is inherited by its items and nested groups + unless a leaf explicitly overrides it. + """ + groups = _load_task_groups() + resolved: list[tuple[str, list[int], str]] = [] + + def expand_group(group_name: str, stack: set[str], inherited_suite: str | None = None) -> None: + if group_name not in groups: + raise ValueError(f"Unknown task group: {group_name}") + if group_name in stack: + raise ValueError(f"Cyclic task group reference detected at '{group_name}'") + + stack.add(group_name) + group_default_suite = groups[group_name].get("suite") + effective_inherited_suite = inherited_suite if inherited_suite is not None else group_default_suite + + for item in groups[group_name].get("tasks", []): + task_identifier = str(item.get("task")) + # Prefer explicit suite on the item; otherwise inherit; otherwise default to lm_eval + item_suite = item.get("suite") + suite_name = ( + str(item_suite) + if item_suite is not None + else (str(effective_inherited_suite) if effective_inherited_suite is not None else "lm_eval") + ) + n_shots_value = item.get("n_shots") + + # Nested group reference: propagate the resolved suite + if task_identifier in groups: + next_inherited = str(item_suite) if item_suite is not None else effective_inherited_suite + # Pass down only an inherited suite (or explicit item override) without defaulting to "lm_eval", + # so that the child group's own default `suite` can take effect if present. 
+ expand_group(task_identifier, stack, next_inherited) + continue + + # Leaf task + if not isinstance(n_shots_value, list): + n_shots: list[int] = [0] + else: + # Ensure ints + n_shots = [int(x) for x in n_shots_value] + + resolved.append((task_identifier, n_shots, suite_name)) + stack.remove(group_name) + + for raw_name in group_names: + name = str(raw_name).strip() + if not name: + continue + expand_group(name, set(), None) + + return resolved + def schedule_evals( models: str | None = None, tasks: str | None = None, + task_groups: str | None = None, n_shot: int | list[int] | None = None, eval_csv_path: str | None = None, *, @@ -405,10 +527,13 @@ def schedule_evals( all models in subdirectories will be automatically discovered - For each model directory, if it has an `hf/iter_XXXXX` structure, all checkpoints will be expanded - This allows passing a single directory containing multiple models to evaluate them all - tasks: A string of comma-separated task paths. - n_shot: An integer or list of integers specifying the number of shots for each task. + tasks: A string of comma-separated task names (lm_eval) or paths. + Requires `n_shot` to be provided. Tasks here are assumed to be lm_eval unless otherwise handled via CSV. + task_groups: A string of comma-separated task group names defined in `task-groups.yaml`. + Each group expands into concrete (task, n_shots, suite) entries; `n_shot` is ignored for groups. + n_shot: An integer or list of integers specifying the number of shots applied to `tasks`. eval_csv_path: A path to a CSV file containing evaluation data. - Warning: exclusive argument. Cannot specify `models`, `tasks`, or `n_shot` when `eval_csv_path` is provided. + Warning: exclusive argument. Cannot specify `models`, `tasks`, `task_groups`, or `n_shot` when `eval_csv_path` is provided. max_array_len: The maximum number of jobs to schedule to run concurrently. Warning: this is not the number of jobs in the array job. This is determined by the environment variable `QUEUE_LIMIT`. download_only: If True, only download the datasets and models and exit. @@ -428,14 +553,15 @@ def schedule_evals( "EVAL_CONTAINER_IMAGE is not set. Please set it in clusters.yaml." ) - ensure_singularity_image(image_name) + _ensure_singularity_image(image_name) else: logging.info("Skipping container image check (--skip-checks enabled)") + if eval_csv_path: - if models or tasks or n_shot: + if models or tasks or task_groups or n_shot: raise ValueError( - "Cannot specify `models`, `tasks`, or `n_shot` when `eval_csv_path` is provided." + "Cannot specify `models`, `tasks`, `task_groups`, or `n_shot` when `eval_csv_path` is provided." 
) df = pd.read_csv(eval_csv_path) required_cols = {"model_path", "task_path", "n_shot"} @@ -484,10 +610,9 @@ def schedule_evals( logging.info( "Skipping model path processing and validation (--skip-checks enabled)" ) - - elif models and tasks and n_shot is not None: - model_list = models.split(",") - model_paths = [] + elif models and ((tasks and n_shot is not None) or task_groups): + model_list = [m.strip() for m in models.split(",") if m.strip()] + model_paths: list[Path | str] = [] # Always expand local paths for model in model_list: @@ -512,21 +637,50 @@ def schedule_evals( "Skipping model path processing and validation (--skip-checks enabled)" ) - tasks_list = tasks.split(",") + rows: list[dict[str, Path | str | int]] = [] - # cross product of model_paths and tasks into a dataframe - df = pd.DataFrame( - product( - model_paths, - tasks_list, - n_shot if isinstance(n_shot, list) else [n_shot], - ), - columns=["model_path", "task_path", "n_shot"], - ) - df["eval_suite"] = "lm_eval" + # Handle explicit tasks (lm_eval) with provided n_shot + if tasks: + if n_shot is None: + raise ValueError( + "When specifying `tasks`, you must also provide `n_shot`. For task groups, use `task_groups`." + ) + tasks_list = [t.strip() for t in tasks.split(",") if t.strip()] + shots: list[int] + shots = n_shot if isinstance(n_shot, list) else [int(n_shot)] + for model_path in model_paths: + for task_name in tasks_list: + for s in shots: + rows.append( + { + "model_path": model_path, + "task_path": task_name, + "n_shot": int(s), + "eval_suite": "lm_eval", + } + ) + + # Handle task groups + if task_groups: + group_names = [g.strip() for g in task_groups.split(",") if g.strip()] + # import pdb; pdb.set_trace() + expanded = _expand_task_groups(group_names) + for model_path in model_paths: + for task_name, n_shots, suite_name in expanded: + for s in n_shots: + rows.append( + { + "model_path": model_path, + "task_path": task_name, + "n_shot": int(s), + "eval_suite": suite_name, + } + ) + + df = pd.DataFrame(rows, columns=["model_path", "task_path", "n_shot", "eval_suite"]) else: raise ValueError( - "Either `eval_csv_path` must be provided, or all of `models`, `tasks`, and `n_shot`." + "Provide `eval_csv_path`, or `models` with (`tasks` and `n_shot`) and/or `task_groups`." 
) if df.empty: @@ -543,6 +697,12 @@ def schedule_evals( _pre_download_task_datasets( lm_eval_tasks, trust_remote_code=trust_remote_code ) + # Pre-download LightEval datasets (best-effort, incremental support) + light_eval_tasks = df[ + df["eval_suite"].str.lower().isin({"lighteval", "light-eval"}) + ]["task_path"].unique() + if len(light_eval_tasks) > 0: + _pre_download_lighteval_datasets(light_eval_tasks) else: logging.info("Skipping dataset pre-download (--skip-checks enabled)") @@ -800,7 +960,6 @@ def collect_results( output_csv: str = "eval_results.csv", *, check: bool = False, - reschedule: bool = False, verbose: bool = False, ) -> None: """ @@ -809,16 +968,12 @@ def collect_results( Args: results_dir: Path to the directory containing result JSON files output_csv: Output CSV filename (default: eval_results.csv) - check: Check for crashed or pending evaluations - reschedule: Show overview table and prompt to reschedule failed/pending jobs + check: Check for missing evaluations and create a missing jobs CSV verbose: Enable verbose logging """ import json - from rich.table import Table - _setup_logging(verbose) - console = Console() results_path = Path(results_dir) if not results_path.exists(): @@ -839,13 +994,12 @@ def collect_results( logging.info(f"Found {len(json_files)} result files") - # If check or reschedule mode, also load the jobs.csv to compare - if check or reschedule: + # If check mode, also load the jobs.csv to compare + if check: jobs_csv_path = results_path / "jobs.csv" if not jobs_csv_path.exists(): logging.warning(f"No jobs.csv found in {results_dir}, cannot perform check") check = False - reschedule = False else: jobs_df = pd.read_csv(jobs_csv_path) logging.info(f"Found {len(jobs_df)} scheduled jobs in jobs.csv") @@ -853,72 +1007,62 @@ def collect_results( # Collect results rows = [] completed_jobs = set() # Track (model, task, n_shot) tuples - results_with_performance = ( - 0 # Track how many results actually have performance data - ) for json_file in json_files: - try: - with open(json_file) as f: - data = json.load(f) - - # Extract model name/path - model_name = data.get("model_name", "unknown") - - # Extract results for each task - results = data.get("results", {}) - n_shot_data = data.get("n-shot", {}) - - for task_name, task_results in results.items(): - # Skip MMLU subtasks - only keep the aggregate score - if task_name.startswith("mmlu_") and task_name != "mmlu": - continue - - # Get n_shot for this task - n_shot = n_shot_data.get(task_name, "unknown") - - # Special handling for MMLU aggregate - get n_shot from any MMLU subtask - if task_name == "mmlu" and n_shot == "unknown": - for key, value in n_shot_data.items(): - if key.startswith("mmlu_"): - n_shot = value - break - - # Get the primary metric (usually acc,none) - performance = task_results.get("acc,none") - if performance is None: - # Try other common metric names - for metric in ["acc", "accuracy", "f1", "exact_match"]: - if metric in task_results: - performance = task_results[metric] - break - - if performance is not None: - results_with_performance += 1 - - # Track completed job for check/reschedule mode (only if we have a result) - if check or reschedule: - completed_jobs.add((model_name, task_name, n_shot)) - - rows.append( - { - "model_name": model_name, - "task": task_name, - "n_shot": n_shot, - "performance": performance, - } + with open(json_file) as f: + data = json.load(f) + + # Extract model name/path + model_name = data.get("model_name", "unknown") + + # Extract results for each task + 
results = data.get("results", {}) + n_shot_data = data.get("n-shot", {}) + + for task_name, task_results in results.items(): + # Skip MMLU subtasks - only keep the aggregate score + if task_name.startswith("mmlu_") and task_name != "mmlu": + continue + + # Get n_shot for this task + n_shot = n_shot_data.get(task_name, "unknown") + + # Special handling for MMLU aggregate - get n_shot from any MMLU subtask + if task_name == "mmlu" and n_shot == "unknown": + for key, value in n_shot_data.items(): + if key.startswith("mmlu_"): + n_shot = value + break + + # Get the primary metric (usually acc,none) + performance = task_results.get("acc,none") + if performance is None: + # Try other common metric names + for metric in ["acc", "accuracy", "f1", "exact_match"]: + if metric in task_results: + performance = task_results[metric] + break + + if performance is not None: + # Track completed job for check mode + if check: + completed_jobs.add((model_name, task_name, n_shot)) + + rows.append( + { + "model_name": model_name, + "task": task_name, + "n_shot": n_shot, + "performance": performance, + } + ) + else: + # Debug: log cases where we have a task but no performance metric + if verbose: + logging.debug( + f"No performance metric found for {model_name} | {task_name} | n_shot={n_shot} in {json_file.name}" ) - else: - # Debug: log cases where we have a task but no performance metric - if verbose: - logging.debug( - f"No performance metric found for {model_name} | {task_name} | n_shot={n_shot} in {json_file.name}" - ) - except Exception as e: - logging.warning(f"Failed to process {json_file}: {e}") - if verbose: - logging.exception(e) if not rows and not check: logging.warning("No results extracted from JSON files") @@ -941,101 +1085,23 @@ def collect_results( ) # Perform check analysis if requested - if check or reschedule: + if check: logging.info("\n=== Evaluation Status Check ===") - # Parse SLURM logs to get more detailed status - slurm_logs_dir = results_path / "slurm_logs" - attempted_jobs = set() # Jobs that were attempted (started) - failed_jobs = set() # Jobs that crashed/failed - - if slurm_logs_dir.exists(): - # Parse .out files to find attempted jobs - for out_file in slurm_logs_dir.glob("*.out"): - try: - with open(out_file) as f: - content = f.read() - # Look for "Starting evaluation for:" patterns - import re - - pattern = r"Starting evaluation for:\s*\n\s*Model: (.+)\s*\n\s*Task: (.+)\s*\n\s*N-shot: (\d+)" - matches = re.findall(pattern, content) - for model, task, n_shot in matches: - attempted_jobs.add( - (model.strip(), task.strip(), int(n_shot.strip())) - ) - - # Check if job finished successfully - if "Job" in content and "finished." 
in content: - # This array job completed successfully - pass - else: - # Job might have crashed - check for specific patterns - if ( - "Traceback" in content - or "Error" in content - or "Exception" in content - ): - for model, task, n_shot in matches: - failed_jobs.add( - ( - model.strip(), - task.strip(), - int(n_shot.strip()), - ) - ) - except Exception as e: - logging.debug(f"Error parsing {out_file}: {e}") - - # Parse .err files for errors - for err_file in slurm_logs_dir.glob("*.err"): - try: - file_size = err_file.stat().st_size - if file_size > 0: # Non-empty error file - # Extract array task ID from filename - array_id_match = re.search(r"-(\d+)\.err$", err_file.name) - if array_id_match: - int(array_id_match.group(1)) - # Find corresponding .out file to get job details - out_file = err_file.with_suffix(".out") - if out_file.exists(): - with open(out_file) as f: - content = f.read() - pattern = r"Starting evaluation for:\s*\n\s*Model: (.+)\s*\n\s*Task: (.+)\s*\n\s*N-shot: (\d+)" - matches = re.findall(pattern, content) - for model, task, n_shot in matches: - failed_jobs.add( - ( - model.strip(), - task.strip(), - int(n_shot.strip()), - ) - ) - except Exception as e: - logging.debug(f"Error parsing {err_file}: {e}") - - # Categorize incomplete jobs - still_running_jobs = [] # Jobs that are likely still executing - never_attempted_jobs = [] - crashed_jobs = [] - needs_rerun_jobs = [] # Jobs that definitely need to be rescheduled - - # We know we have exactly len(completed_jobs) completed jobs with actual results - # The rest need to be categorized - len(completed_jobs) + # Find missing jobs + missing_jobs = [] for _, job in jobs_df.iterrows(): job_tuple = (job["model_path"], job["task_path"], job["n_shot"]) # Check if this job corresponds to one of our completed results - # Use the same matching logic as before but don't over-count is_completed = False - # Try to find a matching completed job + # Try exact matching first if job_tuple in completed_jobs: is_completed = True else: - # Try fuzzy matching + # Try fuzzy matching for model names for completed_job in completed_jobs: completed_model, completed_task, completed_n_shot = completed_job @@ -1050,206 +1116,33 @@ def collect_results( is_completed = True break - if is_completed: - continue # Skip completed jobs - - # Job is not completed, categorize it - if job_tuple in failed_jobs: - crashed_jobs.append(job) - needs_rerun_jobs.append(job) - elif job_tuple not in attempted_jobs: - never_attempted_jobs.append(job) - needs_rerun_jobs.append(job) # These likely need rescheduling too - else: - # Job was attempted but not completed and didn't crash - likely still running - still_running_jobs.append(job) - - needs_rerun_df = pd.DataFrame(needs_rerun_jobs) + if not is_completed: + missing_jobs.append(job) - # Calculate completed jobs based on the jobs.csv perspective - actual_completed_from_jobs = ( - len(jobs_df) - - len(still_running_jobs) - - len(crashed_jobs) - - len(never_attempted_jobs) - ) + completed_count = len(jobs_df) - len(missing_jobs) logging.info(f"\nTotal scheduled jobs: {len(jobs_df)}") - logging.info( - f"Completed jobs (from scheduled jobs): {actual_completed_from_jobs}" - ) - logging.info(f"Still running/pending: {len(still_running_jobs)}") - logging.info(f"Failed/Crashed jobs: {len(crashed_jobs)}") - logging.info(f"Never attempted: {len(never_attempted_jobs)}") - logging.info(f"Jobs needing reschedule: {len(needs_rerun_jobs)}") - - if verbose: - logging.info(f"Total CSV rows (results with performance data): 
{len(rows)}") + logging.info(f"Completed jobs: {completed_count}") + logging.info(f"Missing jobs: {len(missing_jobs)}") + + if len(missing_jobs) > 0: + missing_df = pd.DataFrame(missing_jobs) + missing_csv = output_csv.replace(".csv", "_missing.csv") + missing_df.to_csv(missing_csv, index=False) + logging.info(f"\nMissing jobs saved to: {missing_csv}") logging.info( - f"Unique completed jobs found in JSON files: {len(completed_jobs)}" + f"You can run these with: oellm schedule-eval --eval_csv_path {missing_csv}" ) - if len(completed_jobs) != actual_completed_from_jobs: - logging.info( - f"Note: {len(completed_jobs)} results found vs {actual_completed_from_jobs} jobs matched from schedule" - ) - - if len(needs_rerun_jobs) > 0: - if reschedule: - # Show overview table in reschedule mode - console.print("\n[bold cyan]πŸ”„ Jobs Needing Reschedule[/bold cyan]") - - # Create summary table - summary_table = Table( - show_header=True, header_style="bold magenta", box=box.ROUNDED - ) - summary_table.add_column("Status", style="bold") - summary_table.add_column("Count", justify="right", style="cyan") - - summary_table.add_row("βœ… Completed", str(actual_completed_from_jobs)) - summary_table.add_row("πŸƒ Still Running", str(len(still_running_jobs))) - summary_table.add_row("❌ Crashed", str(len(crashed_jobs))) - summary_table.add_row( - "⏭️ Never Attempted", str(len(never_attempted_jobs)) - ) - summary_table.add_row( - "[bold yellow]πŸ”„ Need Reschedule[/bold yellow]", - f"[bold yellow]{len(needs_rerun_jobs)}[/bold yellow]", - ) - - console.print(summary_table) - - # Show detailed table of jobs to reschedule - console.print("\n[bold cyan]πŸ“‹ Detailed Job List[/bold cyan]") - detail_table = Table( - show_header=True, header_style="bold magenta", box=box.ROUNDED - ) - detail_table.add_column("#", style="dim", width=4) - detail_table.add_column("Status", style="bold", width=15) - detail_table.add_column( - "Model", style="cyan", no_wrap=True, max_width=40 - ) - detail_table.add_column("Task", style="green", max_width=20) - detail_table.add_column("n_shot", justify="right", style="yellow") - - # Show first 20 rows - for idx, (_, job) in enumerate(needs_rerun_df.head(20).iterrows(), 1): - if ( - job["model_path"], - job["task_path"], - job["n_shot"], - ) in failed_jobs: - status = "[red]❌ CRASHED[/red]" - else: - status = "[yellow]⏭️ NOT ATTEMPTED[/yellow]" - - # Truncate long model paths for display - model_display = str(job["model_path"]) - if len(model_display) > 40: - model_display = "..." 
+ model_display[-37:] - - detail_table.add_row( - str(idx), - status, - model_display, - str(job["task_path"]), - str(job["n_shot"]), - ) - - if len(needs_rerun_jobs) > 20: - detail_table.add_row("...", "...", "...", "...", "...") - console.print(detail_table) - console.print( - f"\n[dim]Showing 20 of {len(needs_rerun_jobs)} jobs[/dim]" + # Show some examples if verbose + if verbose and len(missing_jobs) > 0: + logging.info("\nExample missing jobs:") + for _i, (_, job) in enumerate(missing_df.head(5).iterrows()): + logging.info( + f" - {job['model_path']} | {job['task_path']} | n_shot={job['n_shot']}" ) - else: - console.print(detail_table) - - # Ask for confirmation - console.print( - f"\n[bold]Total jobs to reschedule: {len(needs_rerun_jobs)}[/bold]" - ) - - import questionary - from questionary import Style - - custom_style = Style( - [ - ("qmark", "fg:#673ab7 bold"), - ("question", "bold"), - ("answer", "fg:#f44336 bold"), - ("pointer", "fg:#673ab7 bold"), - ("highlighted", "fg:#673ab7 bold"), - ("selected", "fg:#cc5454"), - ] - ) - - save_and_schedule = questionary.confirm( - "\nSave failed jobs CSV and schedule re-evaluation?", - default=True, - style=custom_style, - ).ask() - - if save_and_schedule: - # Save the CSV - rerun_csv = output_csv.replace(".csv", "_needs_rerun.csv") - needs_rerun_df.to_csv(rerun_csv, index=False) - console.print(f"\n[green]βœ… Jobs saved to: {rerun_csv}[/green]") - - # Ask if they want to schedule now - schedule_now = questionary.confirm( - "\nSchedule these jobs now?", - default=True, - style=custom_style, - ).ask() - - if schedule_now: - console.print("\n[yellow]To schedule these jobs, run:[/yellow]") - console.print( - f"[bold cyan]oellm schedule-eval --eval_csv_path {rerun_csv}[/bold cyan]" - ) - - else: - # Original behavior for check mode - # Save jobs that need rescheduling - rerun_csv = output_csv.replace(".csv", "_needs_rerun.csv") - needs_rerun_df.to_csv(rerun_csv, index=False) - logging.info(f"\nJobs needing reschedule saved to: {rerun_csv}") - logging.info( - f"You can re-run these with: [bold cyan]oellm schedule-eval --eval_csv_path {rerun_csv}[/bold cyan]" - ) - - # Save crashed jobs separately if any - if crashed_jobs: - crashed_csv = output_csv.replace(".csv", "_crashed.csv") - pd.DataFrame(crashed_jobs).to_csv(crashed_csv, index=False) - logging.info(f"Crashed jobs specifically saved to: {crashed_csv}") - - # Show some examples if verbose - if verbose and len(needs_rerun_jobs) > 0: - logging.info("\nExample jobs needing reschedule:") - for _i, (_, job) in enumerate(needs_rerun_df.head(5).iterrows()): - if ( - job["model_path"], - job["task_path"], - job["n_shot"], - ) in failed_jobs: - status = "CRASHED" - else: - status = "NEVER ATTEMPTED" - logging.info( - f" - [{status}] {job['model_path']} | {job['task_path']} | n_shot={job['n_shot']}" - ) - if len(needs_rerun_jobs) > 5: - logging.info(f" ... and {len(needs_rerun_jobs) - 5} more") - - if still_running_jobs and verbose: - logging.info( - f"\nNote: {len(still_running_jobs)} jobs appear to still be running/pending." - ) - logging.info( - "These were attempted but haven't completed yet. Check SLURM queue status." - ) + if len(missing_jobs) > 5: + logging.info(f" ... 
and {len(missing_jobs) - 5} more") def main(): diff --git a/oellm/task-groups.yaml b/oellm/task-groups.yaml index 177cb61..e108497 100644 --- a/oellm/task-groups.yaml +++ b/oellm/task-groups.yaml @@ -79,67 +79,100 @@ task_groups: n_shots: [5] - task: belebele_swe_Latn n_shots: [5] - oellm-multilingual: - description: "Combined Belebele EU set plus multilingual benchmarks" + flores-200-eu: + description: "Flores 200 EU tasks" + suite: lighteval tasks: - - task: belebele_bul_Cyrl - n_shots: [5] - - task: belebele_hrv_Latn - n_shots: [5] - - task: belebele_ces_Latn - n_shots: [5] - - task: belebele_dan_Latn - n_shots: [5] - - task: belebele_nld_Latn - n_shots: [5] - - task: belebele_eng_Latn + - task: flores200:bul_Cyrl-eng_Latn + n_shots: [0] + - task: flores200:ces_Latn-eng_Latn + n_shots: [0] + - task: flores200:dan_Latn-eng_Latn + n_shots: [0] + - task: flores200:deu_Latn-eng_Latn + n_shots: [0] + - task: flores200:ell_Grek-eng_Latn + n_shots: [0] + - task: flores200:est_Latn-eng_Latn + n_shots: [0] + - task: flores200:fin_Latn-eng_Latn + n_shots: [0] + - task: flores200:fra_Latn-eng_Latn + n_shots: [0] + - task: flores200:gle_Latn-eng_Latn + n_shots: [0] + - task: flores200:hrv_Latn-eng_Latn + n_shots: [0] + - task: flores200:hun_Latn-eng_Latn + n_shots: [0] + - task: flores200:ita_Latn-eng_Latn + n_shots: [0] + - task: flores200:lit_Latn-eng_Latn + n_shots: [0] + - task: flores200:lvs_Latn-eng_Latn + n_shots: [0] + - task: flores200:mlt_Latn-eng_Latn + n_shots: [0] + - task: flores200:nld_Latn-eng_Latn + n_shots: [0] + - task: flores200:pol_Latn-eng_Latn + n_shots: [0] + - task: flores200:por_Latn-eng_Latn + n_shots: [0] + - task: flores200:ron_Latn-eng_Latn + n_shots: [0] + - task: flores200:slk_Latn-eng_Latn + n_shots: [0] + - task: flores200:slv_Latn-eng_Latn + n_shots: [0] + - task: flores200:spa_Latn-eng_Latn + n_shots: [0] + - task: flores200:swe_Latn-eng_Latn + n_shots: [0] + global-mmlu-eu: + description: "Global MMLU EU tasks" + tasks: + - task: global_mmlu_full_cs n_shots: [5] - - task: belebele_est_Latn + - task: global_mmlu_full_de n_shots: [5] - - task: belebele_fin_Latn + - task: global_mmlu_full_el n_shots: [5] - - task: belebele_fra_Latn + - task: global_mmlu_full_en n_shots: [5] - - task: belebele_deu_Latn + - task: global_mmlu_full_es n_shots: [5] - - task: belebele_ell_Grek + - task: global_mmlu_full_fr n_shots: [5] - - task: belebele_hun_Latn + - task: global_mmlu_full_it n_shots: [5] - - task: belebele_ita_Latn + - task: global_mmlu_full_lt n_shots: [5] - - task: belebele_lvs_Latn + - task: global_mmlu_full_nl n_shots: [5] - - task: belebele_lit_Latn + - task: global_mmlu_full_pl n_shots: [5] - - task: belebele_mlt_Latn + - task: global_mmlu_full_pt n_shots: [5] - - task: belebele_pol_Latn + - task: global_mmlu_full_ro n_shots: [5] - - task: belebele_por_Latn + - task: global_mmlu_full_ru n_shots: [5] - - task: belebele_ron_Latn + - task: global_mmlu_full_sr n_shots: [5] - - task: belebele_slk_Latn + - task: global_mmlu_full_sv n_shots: [5] - - task: belebele_slv_Latn + - task: global_mmlu_full_tr n_shots: [5] - - task: belebele_spa_Latn + - task: global_mmlu_full_uk n_shots: [5] - - task: belebele_swe_Latn + - task: global_mmlu_full_he n_shots: [5] - - task: xwinograd - n_shots: [0] - - task: xcopa - n_shots: [0] - - task: xstorycloze - n_shots: [0] - - task: global_mmlu - n_shots: [0] - suite: lm_eval - - task: light_eval_benchmarks/flores-200-eu.txt - n_shots: [0] - suite: lighteval - - task: include - n_shots: [0] - suite: lm_eval + oellm-multilingual: + 
description: "Combined Belebele EU set plus multilingual benchmarks" + tasks: + # - task: belebele-eu-5-shot + # suite: lm_eval + - task: flores-200-eu + # - task: global-mmlu-eu + # suite: lm_eval diff --git a/oellm/template.sbatch b/oellm/template.sbatch index a4f9317..f4b4905 100644 --- a/oellm/template.sbatch +++ b/oellm/template.sbatch @@ -20,6 +20,7 @@ export HF_XET_CACHE="$HF_HOME/xet" export HF_ASSETS_CACHE="$HF_HOME/assets" export HUGGINGFACE_HUB_CACHE="$HF_HOME/hub" export HUGGINGFACE_ASSETS_CACHE="$HF_HOME/assets" +export HF_DATASETS_CACHE="$HF_HOME/datasets" export HF_HUB_OFFLINE=1 # Path to the shared Singularity image that contains all runtime deps @@ -62,7 +63,7 @@ do model_path=$(echo "$model_path" | tr -d '\r') task_path=$(echo "$task_path" | tr -d '\r') n_shot=$(echo "$n_shot" | tr -d '\r') - eval_suite=$(echo "${eval_suite:-lm_eval}" | tr -d '\r') + eval_suite=$(echo "${{eval_suite:-lm_eval}}" | tr -d '\r') # Skip empty lines if [ -z "$model_path" ]; then @@ -113,15 +114,15 @@ do if [[ -f "$LIGHT_TASK" ]]; then LIGHT_TASK_ARG="$LIGHT_TASK" else - last_segment="${LIGHT_TASK##*|}" + last_segment="${{LIGHT_TASK##*|}}" if [[ "$LIGHT_TASK" == *"|"* && "$last_segment" =~ ^[0-9]+$ ]]; then if [[ -n "$n_shot" && "$last_segment" != "$n_shot" ]]; then - LIGHT_TASK_ARG="${LIGHT_TASK%|*}|$n_shot" + LIGHT_TASK_ARG="${{LIGHT_TASK%|*}}|$n_shot" else LIGHT_TASK_ARG="$LIGHT_TASK" fi else - LIGHT_TASK_ARG="${LIGHT_TASK}|$n_shot" + LIGHT_TASK_ARG="${{LIGHT_TASK}}|$n_shot" fi fi @@ -146,4 +147,4 @@ do done -echo "Job $SLURM_ARRAY_TASK_ID finished." +echo "Job $SLURM_ARRAY_TASK_ID finished." \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 262103d..be55756 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,11 +6,13 @@ readme = "README.md" requires-python = ">=3.12" dependencies = [ "pandas", - "jsonargparse[all]", - "datasets<4.0.0", + "jsonargparse", + "datasets", "rich", "torch", "lm-eval", + "lighteval[extended_tasks,multilingual] @ git+https://github.com/huggingface/lighteval.git@63424f4e795ecc577b90646381b374af3a627978", + "pydantic<2.12", "huggingface_hub", "pyyaml", "questionary", From f9c5bcec47da95533bde632a8052110b9a09963d Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Mon, 20 Oct 2025 10:24:16 +0300 Subject: [PATCH 05/39] fix: lumi paths --- apptainer/lumi.def | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/apptainer/lumi.def b/apptainer/lumi.def index 020a0e8..a7d71d7 100644 --- a/apptainer/lumi.def +++ b/apptainer/lumi.def @@ -2,27 +2,28 @@ Bootstrap: docker From: rocm/pytorch:rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.7.1 %labels - Author multi-cluster-eval - Description Apptainer image for LUMI cluster (converted from dockerfile) + Author oellm-cli + Description Apptainer image for LUMI cluster %post - # 1. Install uv package manager - curl -LsSf https://astral.sh/uv/install.sh | sh - echo 'export PATH=$HOME/.local/bin:$PATH' >> /etc/profile + # Install uv into a global bin + curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR=/usr/local/bin sh - # Make uv visible for subsequent commands during build - export PATH=/root/.local/bin:$PATH + # Put uv-installed tool shims in a global bin too + export UV_TOOL_BIN_DIR=/usr/local/bin + uv --version - # 2. 
Install Python dependencies uv pip install --system --break-system-packages lm-eval \ "transformers<=4.53.0" "datasets<4.0.0" wandb sentencepiece tiktoken accelerate - # Install LightEval CLI in an isolated environment + # Optional: keep tool envs under /opt to avoid $HOME + export UV_TOOL_DIR=/opt/uv-tools uv tool install "lighteval[multilingual]" - + %environment - # Ensure uv is present inside the container runtime as well - export PATH=/root/.local/bin:$PATH + export PATH=/usr/local/bin:$PATH + export UV_TOOL_BIN_DIR=/usr/local/bin + export UV_TOOL_DIR=/opt/uv-tools %runscript exec bash "$@" \ No newline at end of file From 64287d4c02235071109a516d57276b494eceb5aa Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Mon, 20 Oct 2025 13:48:05 +0300 Subject: [PATCH 06/39] fix: faster compression --- .github/workflows/build-and-push-apptainer.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-and-push-apptainer.yml b/.github/workflows/build-and-push-apptainer.yml index b066e28..db5ed7c 100644 --- a/.github/workflows/build-and-push-apptainer.yml +++ b/.github/workflows/build-and-push-apptainer.yml @@ -37,7 +37,7 @@ jobs: - name: Build SIF from definition file run: | - apptainer --verbose build --fakeroot eval_env-${{ matrix.image }}.sif apptainer/${{ matrix.image }}.def + apptainer --verbose build --mksquashfs-args="-comp zstd -Xcompression-level 22" --fakeroot eval_env-${{ matrix.image }}.sif apptainer/${{ matrix.image }}.def - name: Install Hugging Face Hub CLI run: pip install --upgrade "huggingface_hub" From 2674439d7820cfc581792528cffbcb337e1bfec6 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Mon, 20 Oct 2025 14:03:21 +0300 Subject: [PATCH 07/39] fix: faster compression --- .github/workflows/build-and-push-apptainer.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-and-push-apptainer.yml b/.github/workflows/build-and-push-apptainer.yml index db5ed7c..197816e 100644 --- a/.github/workflows/build-and-push-apptainer.yml +++ b/.github/workflows/build-and-push-apptainer.yml @@ -37,7 +37,7 @@ jobs: - name: Build SIF from definition file run: | - apptainer --verbose build --mksquashfs-args="-comp zstd -Xcompression-level 22" --fakeroot eval_env-${{ matrix.image }}.sif apptainer/${{ matrix.image }}.def + apptainer --verbose build --mksquashfs-args="-comp zstd -Xcompression-level 1" --fakeroot eval_env-${{ matrix.image }}.sif apptainer/${{ matrix.image }}.def - name: Install Hugging Face Hub CLI run: pip install --upgrade "huggingface_hub" From 10d42173ad78e3c6a808a25050faef3b1ebf3d31 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Mon, 20 Oct 2025 15:44:49 +0300 Subject: [PATCH 08/39] chore: remove unnecessary files --- oellm/light_eval_benchmarks/flores-200-eu.txt | 44 ------------------- 1 file changed, 44 deletions(-) delete mode 100644 oellm/light_eval_benchmarks/flores-200-eu.txt diff --git a/oellm/light_eval_benchmarks/flores-200-eu.txt b/oellm/light_eval_benchmarks/flores-200-eu.txt deleted file mode 100644 index 414ad1d..0000000 --- a/oellm/light_eval_benchmarks/flores-200-eu.txt +++ /dev/null @@ -1,44 +0,0 @@ -flores200:bul_Cyrl-eng_Latn|0 -flores200:ces_Latn-eng_Latn|0 -flores200:dan_Latn-eng_Latn|0 -flores200:deu_Latn-eng_Latn|0 -flores200:ell_Grek-eng_Latn|0 -flores200:eng_Latn-bul_Cyrl|0 -flores200:eng_Latn-ces_Latn|0 -flores200:eng_Latn-dan_Latn|0 -flores200:eng_Latn-deu_Latn|0 -flores200:eng_Latn-ell_Grek|0 -flores200:eng_Latn-est_Latn|0 -flores200:eng_Latn-fin_Latn|0 
-flores200:eng_Latn-fra_Latn|0 -flores200:eng_Latn-hrv_Latn|0 -flores200:eng_Latn-hun_Latn|0 -flores200:eng_Latn-ita_Latn|0 -flores200:eng_Latn-lit_Latn|0 -flores200:eng_Latn-lvs_Latn|0 -flores200:eng_Latn-mlt_Latn|0 -flores200:eng_Latn-nld_Latn|0 -flores200:eng_Latn-pol_Latn|0 -flores200:eng_Latn-por_Latn|0 -flores200:eng_Latn-ron_Latn|0 -flores200:eng_Latn-slk_Latn|0 -flores200:eng_Latn-slv_Latn|0 -flores200:eng_Latn-spa_Latn|0 -flores200:eng_Latn-swe_Latn|0 -flores200:est_Latn-eng_Latn|0 -flores200:fin_Latn-eng_Latn|0 -flores200:fra_Latn-eng_Latn|0 -flores200:hrv_Latn-eng_Latn|0 -flores200:hun_Latn-eng_Latn|0 -flores200:ita_Latn-eng_Latn|0 -flores200:lit_Latn-eng_Latn|0 -flores200:lvs_Latn-eng_Latn|0 -flores200:mlt_Latn-eng_Latn|0 -flores200:nld_Latn-eng_Latn|0 -flores200:pol_Latn-eng_Latn|0 -flores200:por_Latn-eng_Latn|0 -flores200:ron_Latn-eng_Latn|0 -flores200:slk_Latn-eng_Latn|0 -flores200:slv_Latn-eng_Latn|0 -flores200:spa_Latn-eng_Latn|0 -flores200:swe_Latn-eng_Latn|0 From e2c866ac002cc30d6e7b97103997df7cb407cbc1 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Mon, 20 Oct 2025 15:45:21 +0300 Subject: [PATCH 09/39] fix: ruff formatting target version --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index be55756..2b7a64a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,8 +44,8 @@ url = "https://download.pytorch.org/whl/cpu" explicit = true [tool.ruff] -line-length = 88 -target-version = "py38" +line-length = 90 +target-version = "py312" [tool.ruff.lint] select = [ From 20f04e963a2ab50afe6971a2ea3c5f5f0265b347 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Mon, 20 Oct 2025 15:50:44 +0300 Subject: [PATCH 10/39] chore: restructure task-groups into groups and super-groups --- oellm/task-groups.yaml | 156 +++++++++++++++++------------------------ 1 file changed, 63 insertions(+), 93 deletions(-) diff --git a/oellm/task-groups.yaml b/oellm/task-groups.yaml index e108497..8f91679 100644 --- a/oellm/task-groups.yaml +++ b/oellm/task-groups.yaml @@ -5,6 +5,7 @@ task_groups: open-sci-0.01: description: "open-sci-ref 0.01 evals" + suite: lm-eval-harness tasks: - task: copa n_shots: [0] @@ -32,147 +33,116 @@ task_groups: n_shots: [10] belebele-eu-5-shot: description: "Belebele European language tasks" + suite: lm-eval-harness + n_shots: [5] tasks: - task: belebele_bul_Cyrl - n_shots: [5] - task: belebele_hrv_Latn - n_shots: [5] - task: belebele_ces_Latn - n_shots: [5] - task: belebele_dan_Latn - n_shots: [5] - task: belebele_nld_Latn - n_shots: [5] - task: belebele_eng_Latn - n_shots: [5] - task: belebele_est_Latn - n_shots: [5] - task: belebele_fin_Latn - n_shots: [5] - task: belebele_fra_Latn - n_shots: [5] - task: belebele_deu_Latn - n_shots: [5] - task: belebele_ell_Grek - n_shots: [5] - task: belebele_hun_Latn - n_shots: [5] - task: belebele_ita_Latn - n_shots: [5] - task: belebele_lvs_Latn - n_shots: [5] - task: belebele_lit_Latn - n_shots: [5] - task: belebele_mlt_Latn - n_shots: [5] - task: belebele_pol_Latn - n_shots: [5] - task: belebele_por_Latn - n_shots: [5] - task: belebele_ron_Latn - n_shots: [5] - task: belebele_slk_Latn - n_shots: [5] - task: belebele_slv_Latn - n_shots: [5] - task: belebele_spa_Latn - n_shots: [5] - task: belebele_swe_Latn - n_shots: [5] - flores-200-eu: - description: "Flores 200 EU tasks" + flores-200-eu-to-eng: + description: "Flores 200 EU to English translation" suite: lighteval + n_shots: [0] tasks: - task: flores200:bul_Cyrl-eng_Latn - n_shots: [0] - - task: 
flores200:ces_Latn-eng_Latn - n_shots: [0] - - task: flores200:dan_Latn-eng_Latn - n_shots: [0] - - task: flores200:deu_Latn-eng_Latn - n_shots: [0] - - task: flores200:ell_Grek-eng_Latn - n_shots: [0] - - task: flores200:est_Latn-eng_Latn - n_shots: [0] - - task: flores200:fin_Latn-eng_Latn - n_shots: [0] - - task: flores200:fra_Latn-eng_Latn - n_shots: [0] - - task: flores200:gle_Latn-eng_Latn - n_shots: [0] - - task: flores200:hrv_Latn-eng_Latn - n_shots: [0] - - task: flores200:hun_Latn-eng_Latn - n_shots: [0] - - task: flores200:ita_Latn-eng_Latn - n_shots: [0] - - task: flores200:lit_Latn-eng_Latn - n_shots: [0] - - task: flores200:lvs_Latn-eng_Latn - n_shots: [0] - - task: flores200:mlt_Latn-eng_Latn - n_shots: [0] - - task: flores200:nld_Latn-eng_Latn - n_shots: [0] - - task: flores200:pol_Latn-eng_Latn - n_shots: [0] - - task: flores200:por_Latn-eng_Latn - n_shots: [0] - - task: flores200:ron_Latn-eng_Latn - n_shots: [0] - - task: flores200:slk_Latn-eng_Latn - n_shots: [0] - - task: flores200:slv_Latn-eng_Latn - n_shots: [0] - - task: flores200:spa_Latn-eng_Latn - n_shots: [0] - - task: flores200:swe_Latn-eng_Latn - n_shots: [0] + # - task: flores200:ces_Latn-eng_Latn + # - task: flores200:dan_Latn-eng_Latn + # - task: flores200:deu_Latn-eng_Latn + # - task: flores200:ell_Grek-eng_Latn + # - task: flores200:est_Latn-eng_Latn + # - task: flores200:fin_Latn-eng_Latn + # - task: flores200:fra_Latn-eng_Latn + # - task: flores200:gle_Latn-eng_Latn + # - task: flores200:hrv_Latn-eng_Latn + # - task: flores200:hun_Latn-eng_Latn + # - task: flores200:ita_Latn-eng_Latn + # - task: flores200:lit_Latn-eng_Latn + # - task: flores200:lvs_Latn-eng_Latn + # - task: flores200:mlt_Latn-eng_Latn + # - task: flores200:nld_Latn-eng_Latn + # - task: flores200:pol_Latn-eng_Latn + # - task: flores200:por_Latn-eng_Latn + # - task: flores200:ron_Latn-eng_Latn + # - task: flores200:slk_Latn-eng_Latn + # - task: flores200:slv_Latn-eng_Latn + # - task: flores200:spa_Latn-eng_Latn + # - task: flores200:swe_Latn-eng_Latn + flores-200-eng-to-eu: + description: "Flores 200 English to EU translation" + suite: lighteval + n_shots: [0] + tasks: + - task: flores200:eng_Latn-bul_Cyrl + - task: flores200:eng_Latn-ces_Latn + - task: flores200:eng_Latn-dan_Latn + - task: flores200:eng_Latn-deu_Latn + - task: flores200:eng_Latn-ell_Grek + - task: flores200:eng_Latn-est_Latn + - task: flores200:eng_Latn-fin_Latn + - task: flores200:eng_Latn-fra_Latn + - task: flores200:eng_Latn-gle_Latn + - task: flores200:eng_Latn-hrv_Latn + - task: flores200:eng_Latn-hun_Latn + - task: flores200:eng_Latn-ita_Latn + - task: flores200:eng_Latn-lit_Latn + - task: flores200:eng_Latn-lvs_Latn + - task: flores200:eng_Latn-mlt_Latn + - task: flores200:eng_Latn-nld_Latn + - task: flores200:eng_Latn-pol_Latn + - task: flores200:eng_Latn-por_Latn + - task: flores200:eng_Latn-ron_Latn + - task: flores200:eng_Latn-slk_Latn + - task: flores200:eng_Latn-slv_Latn + - task: flores200:eng_Latn-spa_Latn + - task: flores200:eng_Latn-swe_Latn global-mmlu-eu: description: "Global MMLU EU tasks" + suite: lm-eval-harness + n_shots: [5] tasks: - task: global_mmlu_full_cs - n_shots: [5] - task: global_mmlu_full_de - n_shots: [5] - task: global_mmlu_full_el - n_shots: [5] - task: global_mmlu_full_en - n_shots: [5] - task: global_mmlu_full_es - n_shots: [5] - task: global_mmlu_full_fr - n_shots: [5] - task: global_mmlu_full_it - n_shots: [5] - task: global_mmlu_full_lt - n_shots: [5] - task: global_mmlu_full_nl - n_shots: [5] - task: global_mmlu_full_pl - 
n_shots: [5] - task: global_mmlu_full_pt - n_shots: [5] - task: global_mmlu_full_ro - n_shots: [5] - task: global_mmlu_full_ru - n_shots: [5] - task: global_mmlu_full_sr - n_shots: [5] - task: global_mmlu_full_sv - n_shots: [5] - task: global_mmlu_full_tr - n_shots: [5] - task: global_mmlu_full_uk - n_shots: [5] - task: global_mmlu_full_he - n_shots: [5] + +super_groups: oellm-multilingual: description: "Combined Belebele EU set plus multilingual benchmarks" - tasks: + task_groups: # - task: belebele-eu-5-shot - # suite: lm_eval - - task: flores-200-eu - # - task: global-mmlu-eu - # suite: lm_eval + - task: flores-200-eu-to-eng + # - task: global-mmlu-eu \ No newline at end of file From 73e23772b8824f1e8b60bcae8c71eeb25c1bf3ea Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Mon, 20 Oct 2025 15:51:14 +0300 Subject: [PATCH 11/39] feat: task-cache prototype --- oellm/task_cache.py | 54 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 oellm/task_cache.py diff --git a/oellm/task_cache.py b/oellm/task_cache.py new file mode 100644 index 0000000..49f7f85 --- /dev/null +++ b/oellm/task_cache.py @@ -0,0 +1,54 @@ +import json +from datetime import datetime +from pathlib import Path + + +TASK_CACHE_TTL_DAYS = 30 + + +def get_task_cache_file() -> Path: + return Path(__file__).resolve().parent / "task_map_cache.json" + + +def load_task_cache() -> dict: + cache_file = get_task_cache_file() + if not cache_file.exists(): + return {} + with open(cache_file, "r") as f: + return json.load(f) or {} + + +def save_task_cache(cache: dict) -> None: + cache_file = get_task_cache_file() + with open(cache_file, "w") as f: + json.dump(cache, f, indent=2, sort_keys=True) + + +def task_cache_key(framework: str, task_id: str) -> str: + return f"{framework}::{task_id}" + + +def task_cache_is_fresh(entry: dict, ttl_days: int = TASK_CACHE_TTL_DAYS) -> bool: + ts = float(entry.get("ts", 0)) + age_days = (datetime.now().timestamp() - ts) / 86400.0 + return age_days >= 0 and age_days < float(ttl_days) + + +def task_cache_lookup( + framework: str, task_id: str, ttl_days: int = TASK_CACHE_TTL_DAYS +) -> bool: + cache = load_task_cache() + key = task_cache_key(framework, task_id) + entry = cache.get(key) + if not isinstance(entry, dict): + return False + return task_cache_is_fresh(entry, ttl_days) + + +def task_cache_mark_resolved(framework: str, task_id: str) -> None: + cache = load_task_cache() + key = task_cache_key(framework, task_id) + cache[key] = {"ts": datetime.now().timestamp()} + save_task_cache(cache) + + From f831fbc7b0ed9107b7f8df7ec9c86b581958affe Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Tue, 21 Oct 2025 11:40:51 +0300 Subject: [PATCH 12/39] feat: task super groups --- .gitignore | 1 + oellm/task-groups.yaml | 53 ++++++++++++++++++++---------------------- 2 files changed, 26 insertions(+), 28 deletions(-) diff --git a/.gitignore b/.gitignore index 77fc697..9e29ad2 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ **/*.egg-info **/*.csv **/uv.lock +**/task_map_cache.json \ No newline at end of file diff --git a/oellm/task-groups.yaml b/oellm/task-groups.yaml index 8f91679..e69de26 100644 --- a/oellm/task-groups.yaml +++ b/oellm/task-groups.yaml @@ -1,7 +1,3 @@ -# Default task groups for interactive CSV builder -# Each group contains a list of tasks with their n_shot values -# Format: task_name,n_shot1,n_shot2,... 
- task_groups: open-sci-0.01: description: "open-sci-ref 0.01 evals" @@ -65,28 +61,28 @@ task_groups: n_shots: [0] tasks: - task: flores200:bul_Cyrl-eng_Latn - # - task: flores200:ces_Latn-eng_Latn - # - task: flores200:dan_Latn-eng_Latn - # - task: flores200:deu_Latn-eng_Latn - # - task: flores200:ell_Grek-eng_Latn - # - task: flores200:est_Latn-eng_Latn - # - task: flores200:fin_Latn-eng_Latn - # - task: flores200:fra_Latn-eng_Latn - # - task: flores200:gle_Latn-eng_Latn - # - task: flores200:hrv_Latn-eng_Latn - # - task: flores200:hun_Latn-eng_Latn - # - task: flores200:ita_Latn-eng_Latn - # - task: flores200:lit_Latn-eng_Latn - # - task: flores200:lvs_Latn-eng_Latn - # - task: flores200:mlt_Latn-eng_Latn - # - task: flores200:nld_Latn-eng_Latn - # - task: flores200:pol_Latn-eng_Latn - # - task: flores200:por_Latn-eng_Latn - # - task: flores200:ron_Latn-eng_Latn - # - task: flores200:slk_Latn-eng_Latn - # - task: flores200:slv_Latn-eng_Latn - # - task: flores200:spa_Latn-eng_Latn - # - task: flores200:swe_Latn-eng_Latn + - task: flores200:ces_Latn-eng_Latn + - task: flores200:dan_Latn-eng_Latn + - task: flores200:deu_Latn-eng_Latn + - task: flores200:ell_Grek-eng_Latn + - task: flores200:est_Latn-eng_Latn + - task: flores200:fin_Latn-eng_Latn + - task: flores200:fra_Latn-eng_Latn + - task: flores200:gle_Latn-eng_Latn + - task: flores200:hrv_Latn-eng_Latn + - task: flores200:hun_Latn-eng_Latn + - task: flores200:ita_Latn-eng_Latn + - task: flores200:lit_Latn-eng_Latn + - task: flores200:lvs_Latn-eng_Latn + - task: flores200:mlt_Latn-eng_Latn + - task: flores200:nld_Latn-eng_Latn + - task: flores200:pol_Latn-eng_Latn + - task: flores200:por_Latn-eng_Latn + - task: flores200:ron_Latn-eng_Latn + - task: flores200:slk_Latn-eng_Latn + - task: flores200:slv_Latn-eng_Latn + - task: flores200:spa_Latn-eng_Latn + - task: flores200:swe_Latn-eng_Latn flores-200-eng-to-eu: description: "Flores 200 English to EU translation" suite: lighteval @@ -143,6 +139,7 @@ super_groups: oellm-multilingual: description: "Combined Belebele EU set plus multilingual benchmarks" task_groups: - # - task: belebele-eu-5-shot - task: flores-200-eu-to-eng - # - task: global-mmlu-eu \ No newline at end of file + - task: flores-200-eng-to-eu + - task: belebele-eu-5-shot + - task: global-mmlu-eu \ No newline at end of file From 5fe62ee69554511df16a0efc6bf9e4f38d0e81af Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Tue, 21 Oct 2025 11:41:29 +0300 Subject: [PATCH 13/39] task cache fix --- oellm/task_cache.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/oellm/task_cache.py b/oellm/task_cache.py index 49f7f85..e457e77 100644 --- a/oellm/task_cache.py +++ b/oellm/task_cache.py @@ -2,7 +2,6 @@ from datetime import datetime from pathlib import Path - TASK_CACHE_TTL_DAYS = 30 @@ -50,5 +49,3 @@ def task_cache_mark_resolved(framework: str, task_id: str) -> None: key = task_cache_key(framework, task_id) cache[key] = {"ts": datetime.now().timestamp()} save_task_cache(cache) - - From e816bfdff030de8ba7bc9c604eb746749693de79 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Tue, 21 Oct 2025 22:32:12 +0300 Subject: [PATCH 14/39] fix: task cache; moving data files to oellm/resources --- oellm/interactive_csv_builder.py | 16 +- oellm/main.py | 581 ++----------------------- oellm/resources/__init__.py | 0 oellm/{ => resources}/clusters.yaml | 3 +- oellm/{ => resources}/task-groups.yaml | 8 +- oellm/{ => resources}/template.sbatch | 2 +- oellm/task_cache.py | 285 +++++++++++- oellm/task_groups.py | 131 ++++++ oellm/utils.py | 
480 ++++++++++++++++++++ pyproject.toml | 13 +- 10 files changed, 941 insertions(+), 578 deletions(-) create mode 100644 oellm/resources/__init__.py rename oellm/{ => resources}/clusters.yaml (95%) rename oellm/{ => resources}/task-groups.yaml (96%) rename oellm/{ => resources}/template.sbatch (99%) create mode 100644 oellm/task_groups.py create mode 100644 oellm/utils.py diff --git a/oellm/interactive_csv_builder.py b/oellm/interactive_csv_builder.py index 9b918a2..61c99f1 100644 --- a/oellm/interactive_csv_builder.py +++ b/oellm/interactive_csv_builder.py @@ -1,5 +1,6 @@ import signal import sys +from importlib.resources import files from pathlib import Path import pandas as pd @@ -118,16 +119,13 @@ def signal_handler(sig, frame): task_configs: list[tuple[str, list[int], str]] = [] add_more = True - # Load task groups from YAML file - task_groups_file = Path(__file__).parent / "task-groups.yaml" + # Load task groups from packaged resources task_groups = {} - if task_groups_file.exists(): - try: - with open(task_groups_file) as f: - data = yaml.safe_load(f) - task_groups = data.get("task_groups", {}) - except Exception as e: - console.print(f"[yellow]Warning: Could not load task groups: {e}[/yellow]") + try: + data = yaml.safe_load((files("oellm.resources") / "task-groups.yaml").read_text()) + task_groups = data.get("task_groups", {}) + except Exception as e: + console.print(f"[yellow]Warning: Could not load task groups: {e}[/yellow]") while add_more: choices = [ diff --git a/oellm/main.py b/oellm/main.py index 72679e8..e04d87d 100644 --- a/oellm/main.py +++ b/oellm/main.py @@ -2,504 +2,32 @@ import os import re import shutil -import socket import subprocess from datetime import datetime -from itertools import product +from importlib.resources import files from pathlib import Path from string import Template -from typing import Iterable import numpy as np import pandas as pd -import yaml from jsonargparse import auto_cli -from rich import box -from rich.console import Console -from rich.logging import RichHandler - - -def _ensure_singularity_image(image_name: str) -> None: - # TODO: switch to OELLM dataset repo once it is created - from huggingface_hub import hf_hub_download - - hf_repo = os.environ.get("HF_SIF_REPO", "timurcarstensen/testing") - image_path = Path(os.getenv("EVAL_BASE_DIR")) / image_name - - try: - hf_hub_download( - repo_id=hf_repo, - filename=image_name, - repo_type="dataset", - local_dir=os.getenv("EVAL_BASE_DIR"), - ) - logging.info( - "Successfully downloaded latest Singularity image from HuggingFace" - ) - except Exception as e: - logging.warning( - "Failed to fetch latest container image from HuggingFace: %s", str(e) - ) - if image_path.exists(): - logging.info("Using existing Singularity image at %s", image_path) - else: - raise RuntimeError( - f"No container image found at {image_path} and failed to download from HuggingFace. " - f"Cannot proceed with evaluation scheduling." 
- ) from e - - logging.info( - "Singularity image ready at %s", - Path(os.getenv("EVAL_BASE_DIR")) / os.getenv("EVAL_CONTAINER_IMAGE"), - ) - - -def _setup_logging(verbose: bool = False): - rich_handler = RichHandler( - console=Console(), - show_time=True, - log_time_format="%H:%M:%S", - show_path=False, - markup=True, - rich_tracebacks=True, - ) - - class RichFormatter(logging.Formatter): - def format(self, record): - # Define colors for different log levels - record.msg = f"{record.getMessage()}" - return record.msg - - rich_handler.setFormatter(RichFormatter()) - - root_logger = logging.getLogger() - root_logger.handlers = [] # Remove any default handlers - root_logger.addHandler(rich_handler) - root_logger.setLevel(logging.DEBUG if verbose else logging.INFO) - - -def _load_cluster_env() -> None: - """ - Loads the correct cluster environment variables from `clusters.yaml` based on the hostname. - """ - with open(Path(__file__).parent / "clusters.yaml") as f: - clusters = yaml.safe_load(f) - hostname = socket.gethostname() - - # First load shared environment variables - shared_cfg = clusters.get("shared", {}) - - # match hostname to the regex in the clusters.yaml - for host in set(clusters.keys()) - {"shared"}: - pattern = clusters[host]["hostname_pattern"] - # Convert shell-style wildcards to regex - regex_pattern = pattern.replace(".", r"\.").replace("*", ".*") - if re.match(f"^{regex_pattern}$", hostname): - cluster_cfg = clusters[host] - break - else: - raise ValueError(f"No cluster found for hostname: {hostname}") - - # Combine shared and cluster-specific configs, with cluster-specific taking precedence - # Remove hostname_pattern from the final config - if "hostname_pattern" in cluster_cfg: - del cluster_cfg["hostname_pattern"] - - # Set environment variables, expanding any template variables - for k, v in cluster_cfg.items(): - # Expand template variables using existing environment variables - os.environ[k] = str(v) - - for k, v in shared_cfg.items(): - try: - os.environ[k] = str(v).format(**cluster_cfg) - except KeyError as e: - # when substituting env vars that are not in cluster_cfg but in the environment (e.g., $USER, $SHELL, etc...) - if len(e.args) > 1: - raise ValueError( - f"Env. variable substitution for {k} failed. Missing keys: {', '.join(e.args)}" - ) from e - - missing_key: str = e.args[0] - os.environ[k] = str(v).format( - **cluster_cfg, **{missing_key: os.environ[missing_key]} - ) - - -def _num_jobs_in_queue() -> int: - # TODO avoid running in shell mode which is not secure - result = subprocess.run( - "squeue -u $USER -h -t pending,running -r | wc -l", - shell=True, - capture_output=True, - text=True, - ) - - if result.stdout: - try: - return int(result.stdout.strip()) - except ValueError: - logging.warning(f"Could not parse squeue output: {result.stdout}") - return 0 - - if result.stderr: - logging.warning(f"squeue command produced an error: {result.stderr.strip()}") - - return 0 - - -def _expand_local_model_paths(model: str) -> list[Path]: - """ - Expands a local model path to include all checkpoints if it's a directory. - Recursively searches for models in subdirectories. 
- - Args: - model: Path to a model or directory containing models - - Returns: - List of paths to model directories containing safetensors files - """ - model_paths = [] - model_path = Path(model) - - if not model_path.exists() or not model_path.is_dir(): - return model_paths - - # First check if current directory contains safetensors files - if any(model_path.glob("*.safetensors")): - model_paths.append(model_path) - # If current dir has safetensors, don't recurse further - return model_paths - - # Check for hf subdirectory pattern (single model with checkpoints) - hf_path = model_path / "hf" - if hf_path.exists() and hf_path.is_dir(): - # This is a single model with checkpoints in hf/iter_* structure - for subdir in hf_path.glob("*"): - if subdir.is_dir() and any(subdir.glob("*.safetensors")): - model_paths.append(subdir) - if model_paths: - return model_paths - - # Check if subdirectories look like model directories - # (e.g., open-sci-ref_model-0.13b_data-c4_...) - subdirs = [d for d in model_path.iterdir() if d.is_dir()] - - # Process each subdirectory as a potential model - for subdir in subdirs: - # Check if this subdirectory directly contains safetensors - if any(subdir.glob("*.safetensors")): - model_paths.append(subdir) - else: - # Check for hf/iter_* pattern in this subdirectory - hf_subpath = subdir / "hf" - if hf_subpath.exists() and hf_subpath.is_dir(): - for checkpoint_dir in hf_subpath.glob("*"): - if checkpoint_dir.is_dir() and any( - checkpoint_dir.glob("*.safetensors") - ): - model_paths.append(checkpoint_dir) - - if len(model_paths) > 1: - logging.info(f"Expanded '{model}' to {len(model_paths)} model checkpoints") - - return model_paths - - -def _process_model_paths(models: Iterable[str]) -> dict[str, list[Path | str]]: - """ - Processes model strings into a dict of model paths. - - Each model string can be a local path or a huggingface model identifier. - This function expands directory paths that contain multiple checkpoints. - """ - from huggingface_hub import snapshot_download - - processed_model_paths = {} - model_paths = [] - for model in models: - # First try to expand local paths - local_paths = _expand_local_model_paths(model) - if local_paths: - model_paths.extend(local_paths) - else: - logging.info( - f"Model {model} not found locally, assuming it is a πŸ€— hub model" - ) - logging.debug( - f"Downloading model {model} on the login node since the compute nodes may not have access to the internet" - ) - - if "," in model: - model_kwargs = dict( - [kv.split("=") for kv in model.split(",") if "=" in kv] - ) - - # The first element before the comma is the repository ID on the πŸ€— Hub - repo_id = model.split(",")[0] - - # snapshot_download kwargs - snapshot_kwargs = {} - if "revision" in model_kwargs: - snapshot_kwargs["revision"] = model_kwargs["revision"] - - try: - # Pre-download (or reuse cache) for the whole repository so that - # compute nodes can load it offline. - snapshot_download( - repo_id=repo_id, - cache_dir=Path(os.getenv("HF_HOME")) / "hub", - **snapshot_kwargs, - ) - model_paths.append(model) - except Exception as e: - logging.debug( - f"Failed to download model {model} from Hugging Face Hub. Continuing..." - ) - logging.debug(e) - else: - # Download the entire model repository to the local cache. The - # original identifier is kept in *model_paths* so downstream - # code can still reference it; at runtime the files will be - # read from cache, allowing offline execution. 
- snapshot_download( - repo_id=model, - cache_dir=Path(os.getenv("HF_HOME")) / "hub", - ) - model_paths.append(model) - - if not model_paths: - logging.warning( - f"Could not find any valid model for '{model}'. It will be skipped." - ) - processed_model_paths[model] = model_paths - return processed_model_paths - - -def _count_task_subtasks(task_name: str, task_manager) -> int: - from lm_eval.evaluator_utils import get_subtask_list # type: ignore - - task_objects = task_manager.load_task_or_group(task_name) - subtask_dict = get_subtask_list(task_objects) - - total_subtasks = 0 - for _, subtask_list in subtask_dict.items(): - total_subtasks += len(subtask_list) - - return max(1, total_subtasks) # At least 1 subtask - - -def _calculate_task_minutes( - task_name: str, task_manager, base_minutes_per_subtask: int = 5 -) -> int: - """Calculate estimated minutes for a task based on its subtask count.""" - subtask_count = _count_task_subtasks(task_name, task_manager) - - # Special handling for known multi-language tasks that take longer per subtask - known_complex_tasks = { - "belebele": 8, # Multi-language reading comprehension, slower per subtask - "flores": 6, # Translation task, moderately complex - "xnli": 6, # Cross-lingual NLI - "xcopa": 6, # Cross-lingual COPA - "xstory_cloze": 6, # Cross-lingual story cloze - "paws-x": 6, # Cross-lingual paraphrase detection - "hellaswag": 20, # Hellaswag task, needs 20 minutes per subtask - } - - # Use task-specific timing if available, otherwise use default - minutes_per_subtask = known_complex_tasks.get( - task_name.lower(), base_minutes_per_subtask - ) - - # Calculate total time: (subtasks Γ— time_per_subtask) + base_overhead - base_overhead = 3 # Base overhead for task setup/teardown - total_minutes = max(10, (subtask_count * minutes_per_subtask) + base_overhead) - - # Log for complex tasks (>5 subtasks) or any known complex task - if subtask_count > 5 or task_name.lower() in known_complex_tasks: - complexity_note = ( - f" (known complex task, {minutes_per_subtask} min/subtask)" - if task_name.lower() in known_complex_tasks - else "" - ) - logging.info( - f"πŸ“Š Task '{task_name}' has {subtask_count} subtasks{complexity_note}, " - f"estimated time: {total_minutes} minutes ({total_minutes / 60:.1f} hours)" - ) - - return total_minutes - - -def _pre_download_task_datasets( - tasks: Iterable[str], trust_remote_code: bool = True -) -> None: - """Ensure that all datasets required by the given `tasks` are present in the local πŸ€— cache at $HF_HOME.""" - - from datasets import DownloadMode # type: ignore - from lm_eval.tasks import TaskManager # type: ignore - - processed: set[str] = set() - - tm = TaskManager() - - for task_name in tasks: - if not isinstance(task_name, str) or task_name in processed: - continue - processed.add(task_name) - - logging.info( - f"Preparing dataset for task '{task_name}' (download if not cached)…" - ) - - # Instantiating the task downloads the dataset (or reuses cache) - - task_config = { - "task": task_name, - "dataset_kwargs": {"trust_remote_code": trust_remote_code}, - } - - task_objects = tm.load_config(task_config) - - # Some entries might be nested dictionaries (e.g., groups) - stack = [task_objects] - while stack: - current = stack.pop() - if isinstance(current, dict): - stack.extend(current.values()) - continue - if hasattr(current, "download") and callable(current.download): - try: - current.download(download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS) # type: ignore[arg-type] - except TypeError as e: - logging.error( - 
f"Failed to download dataset for task '{task_name}' with download_mode=REUSE_DATASET_IF_EXISTS: {e}" - ) - current.download() # type: ignore[misc] - - logging.debug(f"Finished dataset preparation for task '{task_name}'.") - - -def _pre_download_lighteval_datasets(tasks: Iterable[str]) -> None: - """Pre-download LightEval datasets by instantiating tasks via the local LightEval Registry.""" - import sys - - local_le_src = Path(__file__).parent.parent / "lighteval" / "src" - if local_le_src.exists(): - sys.path.insert(0, str(local_le_src)) - - from lighteval.tasks.registry import Registry, TRUNCATE_FEW_SHOTS_DEFAULTS # type: ignore - from lighteval.tasks.lighteval_task import LightevalTask # type: ignore - - file_task_specs: list[str] = [] - string_task_specs: list[str] = [] - - for t in tasks: - raw = str(t).strip() - if not raw: - continue - candidate = Path(raw) - if candidate.exists() and candidate.is_file(): - file_task_specs.append(str(candidate)) - else: - spec = raw - truncate_default = int(TRUNCATE_FEW_SHOTS_DEFAULTS) - if "|" not in spec: - spec = f"lighteval|{spec}|0|{truncate_default}" - elif spec.count("|") == 1: - spec = f"{spec}|0|{truncate_default}" - elif spec.count("|") == 2: - spec = f"{spec}|{truncate_default}" - string_task_specs.append(spec) - - unique_string_specs = sorted(set(string_task_specs)) - unique_file_specs = sorted(set(file_task_specs)) - - if unique_string_specs: - reg = Registry(custom_tasks="lighteval.tasks.multilingual.tasks") - configs = reg.get_tasks_configs(",".join(unique_string_specs)) - task_dict = reg.get_tasks_from_configs(configs) - LightevalTask.load_datasets(task_dict) - - for fp in unique_file_specs: - reg_file = Registry() - configs_file = reg_file.get_tasks_configs(fp) - task_dict_file = reg_file.get_tasks_from_configs(configs_file) - LightevalTask.load_datasets(task_dict_file) - -def _load_task_groups() -> dict[str, dict]: - """Load task groups from `task-groups.yaml` located next to this module.""" - groups_file = Path(__file__).parent / "task-groups.yaml" - if not groups_file.exists(): - raise ValueError(f"Task groups file not found: {groups_file}") - - with open(groups_file) as f: - data = yaml.safe_load(f) or {} - - groups = data.get("task_groups") or {} - if not isinstance(groups, dict): - raise ValueError("Invalid task groups format in task-groups.yaml") - - return groups - - -def _expand_task_groups(group_names: Iterable[str]) -> list[tuple[str, list[int], str]]: - """ - Expand task group names into concrete (task, n_shots, suite) tuples. - - Supports nested groups. Defaults: suite=lm_eval, n_shots=[0] when absent. - A group's `suite` (if present) is inherited by its items and nested groups - unless a leaf explicitly overrides it. 
- """ - groups = _load_task_groups() - resolved: list[tuple[str, list[int], str]] = [] - - def expand_group(group_name: str, stack: set[str], inherited_suite: str | None = None) -> None: - if group_name not in groups: - raise ValueError(f"Unknown task group: {group_name}") - if group_name in stack: - raise ValueError(f"Cyclic task group reference detected at '{group_name}'") - - stack.add(group_name) - group_default_suite = groups[group_name].get("suite") - effective_inherited_suite = inherited_suite if inherited_suite is not None else group_default_suite - - for item in groups[group_name].get("tasks", []): - task_identifier = str(item.get("task")) - # Prefer explicit suite on the item; otherwise inherit; otherwise default to lm_eval - item_suite = item.get("suite") - suite_name = ( - str(item_suite) - if item_suite is not None - else (str(effective_inherited_suite) if effective_inherited_suite is not None else "lm_eval") - ) - n_shots_value = item.get("n_shots") - - # Nested group reference: propagate the resolved suite - if task_identifier in groups: - next_inherited = str(item_suite) if item_suite is not None else effective_inherited_suite - # Pass down only an inherited suite (or explicit item override) without defaulting to "lm_eval", - # so that the child group's own default `suite` can take effect if present. - expand_group(task_identifier, stack, next_inherited) - continue - - # Leaf task - if not isinstance(n_shots_value, list): - n_shots: list[int] = [0] - else: - # Ensure ints - n_shots = [int(x) for x in n_shots_value] - - resolved.append((task_identifier, n_shots, suite_name)) - stack.remove(group_name) - - for raw_name in group_names: - name = str(raw_name).strip() - if not name: - continue - expand_group(name, set(), None) - - return resolved +from oellm.task_cache import clear_task_cache +from oellm.task_groups import _expand_task_groups +from oellm.utils import ( + _ensure_singularity_image, + _expand_local_model_paths, + _load_cluster_env, + _num_jobs_in_queue, + _pre_download_lighteval_datasets, + _pre_download_task_datasets, + _process_model_paths, + _setup_logging, + capture_third_party_output_from_kwarg, +) + + +@capture_third_party_output_from_kwarg("verbose") def schedule_evals( models: str | None = None, tasks: str | None = None, @@ -557,7 +85,6 @@ def schedule_evals( else: logging.info("Skipping container image check (--skip-checks enabled)") - if eval_csv_path: if models or tasks or task_groups or n_shot: raise ValueError( @@ -610,6 +137,7 @@ def schedule_evals( logging.info( "Skipping model path processing and validation (--skip-checks enabled)" ) + elif models and ((tasks and n_shot is not None) or task_groups): model_list = [m.strip() for m in models.split(",") if m.strip()] model_paths: list[Path | str] = [] @@ -677,7 +205,9 @@ def schedule_evals( } ) - df = pd.DataFrame(rows, columns=["model_path", "task_path", "n_shot", "eval_suite"]) + df = pd.DataFrame( + rows, columns=["model_path", "task_path", "n_shot", "eval_suite"] + ) else: raise ValueError( "Provide `eval_csv_path`, or `models` with (`tasks` and `n_shot`) and/or `task_groups`." 
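For reference, `schedule_evals` is now wrapped by `capture_third_party_output_from_kwarg("verbose")` from the new `oellm/utils.py`, whose body does not appear in this hunk. A minimal sketch of what such a decorator might do, assuming it only silences third-party stdout/stderr unless the named keyword argument is truthy; the names and behaviour below are illustrative, not the shipped implementation:

import contextlib
import functools
import io


def capture_third_party_output_from_kwarg(kwarg_name: str):
    # Sketch only: suppress noisy library output unless e.g. verbose=True is passed.
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if kwargs.get(kwarg_name):
                # Verbose call: let stdout/stderr through untouched.
                return func(*args, **kwargs)
            buffer = io.StringIO()
            # Quiet call: capture everything third-party code prints.
            with contextlib.redirect_stdout(buffer), contextlib.redirect_stderr(buffer):
                return func(*args, **kwargs)

        return wrapper

    return decorator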
@@ -741,71 +271,20 @@ def schedule_evals( logging.debug(f"Saved evaluation dataframe to temporary CSV: {csv_path}") - with open(Path(__file__).parent / "template.sbatch") as f: - sbatch_template = f.read() + sbatch_template = (files("oellm.resources") / "template.sbatch").read_text() # Calculate dynamic array size and time limits total_evals = len(df) - # Calculate time based on actual task complexity (subtask count) - if not skip_checks: - from lm_eval.tasks import TaskManager # type: ignore - - shared_task_manager = TaskManager() - - # Calculate total minutes by considering each unique task's complexity - total_minutes = 0 - task_time_cache = {} # Cache to avoid recalculating for same tasks - - lm_eval_mask = df["eval_suite"].str.lower().isin( - {"lm_eval", "lm-eval", "lm-eval-harness"} - ) - light_eval_mask = df["eval_suite"].str.lower().isin({"lighteval", "light-eval"}) - - for _, row in df[lm_eval_mask].iterrows(): - task_name = row["task_path"] - if task_name not in task_time_cache: - task_time_cache[task_name] = _calculate_task_minutes( - task_name, task_manager=shared_task_manager - ) - total_minutes += task_time_cache[task_name] - - if light_eval_mask.any(): - # LightEval benchmarks can be large; budget 15 minutes per evaluation - light_eval_minutes = int(light_eval_mask.sum() * 15) - total_minutes += light_eval_minutes - logging.info( - "Estimated LightEval time budget: %s minutes across %s evaluations", - light_eval_minutes, - light_eval_mask.sum(), - ) - - # Calculate average minutes per eval for logging purposes - minutes_per_eval = total_minutes / total_evals if total_evals > 0 else 10 - - logging.info("πŸ“Š Dynamic time calculation:") - for task_name, task_minutes in task_time_cache.items(): - task_count = ( - (df["task_path"] == task_name) - & df["eval_suite"].str.lower().isin( - {"lm_eval", "lm-eval", "lm-eval-harness"} - ) - ).sum() - logging.info( - f" Task '{task_name}': {task_minutes} min/eval Γ— {task_count} evals = {task_minutes * task_count} total minutes" - ) - else: - # Fallback to fixed timing when checks are skipped - minutes_per_eval = 10 # Budget 10 minutes per eval - total_minutes = total_evals * minutes_per_eval - logging.info( - "⚠️ Using fixed 10 min/eval (task complexity detection skipped with --skip-checks)" - ) + # fixed timing estimation + minutes_per_eval = 10 # Budget 10 minutes per eval + total_minutes = total_evals * minutes_per_eval # Copy LightEval benchmark files into evaluation directory if necessary - light_eval_paths = df[ - df["eval_suite"].str.lower().isin({"lighteval", "light-eval"}) - ]["task_path"].unique() + # TODO: why do we need this? 
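    # A plausible reason (assumption, not confirmed by this patch): for the
    # LightEval suite the task_path column can hold a benchmark file rather than
    # a task name (cf. _pre_download_lighteval_datasets), so any referenced files
    # are presumably staged under evals_dir/light_eval_tasks and copied_paths
    # maps each original path to the staged copy used in the jobs CSV.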
+ light_eval_paths = df[df["eval_suite"].str.lower().isin({"lighteval", "light-eval"})][ + "task_path" + ].unique() benchmark_dir = evals_dir / "light_eval_tasks" copied_paths: dict[str, str] = {} if light_eval_paths.size > 0: @@ -1063,7 +542,6 @@ def collect_results( f"No performance metric found for {model_name} | {task_name} | n_shot={n_shot} in {json_file.name}" ) - if not rows and not check: logging.warning("No results extracted from JSON files") return @@ -1151,6 +629,7 @@ def main(): "schedule-eval": schedule_evals, "build-csv": build_csv, "collect-results": collect_results, + "clean-cache": lambda: clear_task_cache(), }, as_positional=False, description="OELLM: Multi-cluster evaluation tool for language models", diff --git a/oellm/resources/__init__.py b/oellm/resources/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/oellm/clusters.yaml b/oellm/resources/clusters.yaml similarity index 95% rename from oellm/clusters.yaml rename to oellm/resources/clusters.yaml index 5e14325..d6da6d2 100644 --- a/oellm/clusters.yaml +++ b/oellm/resources/clusters.yaml @@ -5,6 +5,7 @@ shared: HF_HOME: "{EVAL_BASE_DIR}/hf_data" # where HuggingFace models and datasets are stored EVAL_OUTPUT_DIR: "{EVAL_BASE_DIR}/{USER}" # where evaluations are written GPUS_PER_NODE: 1 + HF_HUB_DISABLE_PROGRESS_BARS: "1" leonardo: hostname_pattern: "*.leonardo.local" # use this regexp to automatically assign environment variables corresponding to this YAML @@ -31,4 +32,4 @@ lumi: ACCOUNT: "project_462000963" QUEUE_LIMIT: 210 EVAL_CONTAINER_IMAGE: "eval_env-lumi.sif" - SINGULARITY_ARGS: "--rocm" \ No newline at end of file + SINGULARITY_ARGS: "--rocm" diff --git a/oellm/task-groups.yaml b/oellm/resources/task-groups.yaml similarity index 96% rename from oellm/task-groups.yaml rename to oellm/resources/task-groups.yaml index e69de26..f9b7684 100644 --- a/oellm/task-groups.yaml +++ b/oellm/resources/task-groups.yaml @@ -75,7 +75,7 @@ task_groups: - task: flores200:lit_Latn-eng_Latn - task: flores200:lvs_Latn-eng_Latn - task: flores200:mlt_Latn-eng_Latn - - task: flores200:nld_Latn-eng_Latn + - task: flores200:nld_Latn-eng_Latn - task: flores200:pol_Latn-eng_Latn - task: flores200:por_Latn-eng_Latn - task: flores200:ron_Latn-eng_Latn @@ -103,7 +103,7 @@ task_groups: - task: flores200:eng_Latn-lit_Latn - task: flores200:eng_Latn-lvs_Latn - task: flores200:eng_Latn-mlt_Latn - - task: flores200:eng_Latn-nld_Latn + - task: flores200:eng_Latn-nld_Latn - task: flores200:eng_Latn-pol_Latn - task: flores200:eng_Latn-por_Latn - task: flores200:eng_Latn-ron_Latn @@ -140,6 +140,6 @@ super_groups: description: "Combined Belebele EU set plus multilingual benchmarks" task_groups: - task: flores-200-eu-to-eng - - task: flores-200-eng-to-eu + # - task: flores-200-eng-to-eu - task: belebele-eu-5-shot - - task: global-mmlu-eu \ No newline at end of file + # - task: global-mmlu-eu diff --git a/oellm/template.sbatch b/oellm/resources/template.sbatch similarity index 99% rename from oellm/template.sbatch rename to oellm/resources/template.sbatch index f4b4905..b68d637 100644 --- a/oellm/template.sbatch +++ b/oellm/resources/template.sbatch @@ -147,4 +147,4 @@ do done -echo "Job $SLURM_ARRAY_TASK_ID finished." \ No newline at end of file +echo "Job $SLURM_ARRAY_TASK_ID finished." 
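Earlier in this series the sbatch template's bash parameter expansions were rewritten with doubled braces (e.g. ${{eval_suite:-lm_eval}}, ${{LIGHT_TASK##*|}}). A minimal sketch of why that escaping is needed, assuming the template text is filled in with Python's str.format-style substitution; the helper name below is hypothetical:

def _render_sbatch_sketch(template_text: str, **slots: str) -> str:
    # With str.format, "{placeholder}" fields are substituted, while literal
    # braces must be doubled, so "${{VAR:-default}}" comes out as the bash
    # expansion "${VAR:-default}" in the rendered script.
    return template_text.format(**slots)


# Illustrative usage:
# _render_sbatch_sketch('eval_suite=$(echo "${{eval_suite:-lm_eval}}") # csv={csv}', csv="jobs.csv")
# -> 'eval_suite=$(echo "${eval_suite:-lm_eval}") # csv=jobs.csv'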
diff --git a/oellm/task_cache.py b/oellm/task_cache.py index e457e77..d8be806 100644 --- a/oellm/task_cache.py +++ b/oellm/task_cache.py @@ -1,4 +1,6 @@ import json +import logging +from contextlib import contextmanager from datetime import datetime from pathlib import Path @@ -6,15 +8,15 @@ def get_task_cache_file() -> Path: - return Path(__file__).resolve().parent / "task_map_cache.json" + return Path(__file__).resolve().parent / "resources" / "task_map_cache.json" def load_task_cache() -> dict: cache_file = get_task_cache_file() - if not cache_file.exists(): - return {} - with open(cache_file, "r") as f: - return json.load(f) or {} + if cache_file.exists(): + with open(cache_file) as f: + return json.load(f) or {} + return {} def save_task_cache(cache: dict) -> None: @@ -23,6 +25,12 @@ def save_task_cache(cache: dict) -> None: json.dump(cache, f, indent=2, sort_keys=True) +def clear_task_cache() -> None: + cache_file = get_task_cache_file() + with open(cache_file, "w") as f: + json.dump({}, f) + + def task_cache_key(framework: str, task_id: str) -> str: return f"{framework}::{task_id}" @@ -47,5 +55,270 @@ def task_cache_lookup( def task_cache_mark_resolved(framework: str, task_id: str) -> None: cache = load_task_cache() key = task_cache_key(framework, task_id) - cache[key] = {"ts": datetime.now().timestamp()} + entry = cache.get(key) if isinstance(cache.get(key), dict) else {} + entry["ts"] = datetime.now().timestamp() + cache[key] = entry + save_task_cache(cache) + + +def task_cache_get_payload(framework: str, task_id: str) -> dict | None: + cache = load_task_cache() + key = task_cache_key(framework, task_id) + entry = cache.get(key) + if not isinstance(entry, dict): + return None + payload = entry.get("payload") + return payload if isinstance(payload, dict) else None + + +def task_cache_set_payload(framework: str, task_id: str, payload: dict) -> None: + cache = load_task_cache() + key = task_cache_key(framework, task_id) + entry: dict = cache.get(key) if isinstance(cache.get(key), dict) else {} # type: ignore[assignment] + entry["ts"] = datetime.now().timestamp() + entry["payload"] = payload + cache[key] = entry save_task_cache(cache) + + +def _canonical_key(call: dict) -> tuple: + t = call.get("type") + if t == "load_dataset": + return ( + t, + call.get("path"), + call.get("name"), + call.get("split"), + call.get("revision"), + ) + if t == "snapshot_download": + return ( + t, + call.get("repo_id"), + call.get("repo_type"), + call.get("revision"), + ) + if t == "hf_hub_download": + return ( + t, + call.get("repo_id"), + call.get("filename"), + call.get("repo_type"), + call.get("revision"), + ) + return (str(t),) + + +def dedupe_calls(calls: list[dict]) -> list[dict]: + if not isinstance(calls, list): + return [] + best: dict[tuple, dict] = {} + for c in calls: + if not isinstance(c, dict): + continue + key = _canonical_key(c) + existing = best.get(key) + if existing is None: + best[key] = c + continue + # Prefer trust_remote_code=True for load_dataset + if c.get("type") == "load_dataset": + if bool(c.get("trust_remote_code")) and not bool( + existing.get("trust_remote_code") + ): + best[key] = c + # Optionally drop snapshot_download if matching load_dataset exists + filtered: list[dict] = [] + load_keys = { + ("load_dataset", k[1], k[2], k[3], k[4]) + for k in best.keys() + if k and k[0] == "load_dataset" + } + for k, v in best.items(): + if k and k[0] == "snapshot_download": + # derive comparable key shape: (type, repo_id, None, None, revision) + comparable = ("load_dataset", k[1], 
None, None, k[3]) + if comparable in load_keys: + continue + filtered.append(v) + return filtered + + +@contextmanager +def capture_hf_dataset_calls(): + captured: list[dict] = [] + + import datasets as _ds # type: ignore + import huggingface_hub as _hfh # type: ignore + + _orig_load_dataset = _ds.load_dataset + _orig_snapshot_download = _hfh.snapshot_download + _orig_hf_hub_download = _hfh.hf_hub_download + + def _load_dataset_proxy(path, *args, **kwargs): # noqa: ANN001 + name = ( + kwargs.get("name") + if "name" in kwargs + else (args[0] if len(args) > 0 else None) + ) + data_files = ( + kwargs.get("data_files") + if "data_files" in kwargs + else (args[1] if len(args) > 1 else None) + ) + split = ( + kwargs.get("split") + if "split" in kwargs + else (args[2] if len(args) > 2 else None) + ) + trust_remote_code = kwargs.get("trust_remote_code") + revision = kwargs.get("revision") + captured.append( + { + "type": "load_dataset", + "path": path, + "name": name, + "data_files": data_files, + "split": split, + "revision": revision, + "trust_remote_code": trust_remote_code, + } + ) + return _orig_load_dataset(path, *args, **kwargs) + + def _snapshot_download_proxy(*args, **kwargs): # noqa: ANN001 + repo_id = ( + kwargs.get("repo_id") + if "repo_id" in kwargs + else (args[0] if len(args) > 0 else None) + ) + repo_type = ( + kwargs.get("repo_type") + if "repo_type" in kwargs + else (args[1] if len(args) > 1 else None) + ) + revision = ( + kwargs.get("revision") + if "revision" in kwargs + else (args[2] if len(args) > 2 else None) + ) + captured.append( + { + "type": "snapshot_download", + "repo_id": repo_id, + "repo_type": repo_type, + "revision": revision, + } + ) + return _orig_snapshot_download(*args, **kwargs) + + def _hf_hub_download_proxy(*args, **kwargs): # noqa: ANN001 + repo_id = ( + kwargs.get("repo_id") + if "repo_id" in kwargs + else (args[0] if len(args) > 0 else None) + ) + filename = ( + kwargs.get("filename") + if "filename" in kwargs + else (args[1] if len(args) > 1 else None) + ) + repo_type = ( + kwargs.get("repo_type") + if "repo_type" in kwargs + else (args[2] if len(args) > 2 else None) + ) + revision = ( + kwargs.get("revision") + if "revision" in kwargs + else (args[3] if len(args) > 3 else None) + ) + captured.append( + { + "type": "hf_hub_download", + "repo_id": repo_id, + "filename": filename, + "repo_type": repo_type, + "revision": revision, + } + ) + return _orig_hf_hub_download(*args, **kwargs) + + _ds.load_dataset = _load_dataset_proxy # type: ignore[assignment] + _hfh.snapshot_download = _snapshot_download_proxy # type: ignore[assignment] + _hfh.hf_hub_download = _hf_hub_download_proxy # type: ignore[assignment] + try: + yield captured + finally: + _ds.load_dataset = _orig_load_dataset # type: ignore[assignment] + _hfh.snapshot_download = _orig_snapshot_download # type: ignore[assignment] + _hfh.hf_hub_download = _orig_hf_hub_download # type: ignore[assignment] + + +def prewarm_from_payload(payload: dict | None, *, trust_remote_code: bool = True) -> None: + if not isinstance(payload, dict): + return + calls = payload.get("calls") + if not isinstance(calls, list): + return + + from datasets import load_dataset # type: ignore + from huggingface_hub import hf_hub_download, snapshot_download # type: ignore + + for call in calls: + if not isinstance(call, dict): + continue + # Unified prewarm log message + dataset_id = None + if call.get("type") == "load_dataset": + path = call.get("path") + name = call.get("name") + dataset_id = f"{path}{'::' + name if name else ''}" 
+ else: + repo_id = call.get("repo_id") + filename = call.get("filename") + dataset_id = ( + f"{repo_id}{'/' + filename if filename else ''}" + if isinstance(repo_id, str) + else None + ) + if dataset_id: + logging.info(f"Prewarming dataset cache: {dataset_id}") + if call.get("type") == "snapshot_download": + repo_id = call.get("repo_id") + if isinstance(repo_id, str) and repo_id: + snapshot_download( + repo_id=repo_id, + repo_type=call.get("repo_type") or "dataset", + revision=call.get("revision"), + ) + continue + if call.get("type") == "hf_hub_download": + repo_id = call.get("repo_id") + filename = call.get("filename") + if isinstance(repo_id, str) and isinstance(filename, str): + hf_hub_download( + repo_id=repo_id, + filename=filename, + repo_type=call.get("repo_type"), + revision=call.get("revision"), + ) + continue + path = call.get("path") + name = call.get("name") + data_files = call.get("data_files") + split = call.get("split") + revision = call.get("revision") + trc = call.get("trust_remote_code", trust_remote_code) + kwargs: dict = {} + if name is not None: + kwargs["name"] = name + if data_files is not None: + kwargs["data_files"] = data_files + if revision is not None: + kwargs["revision"] = revision + kwargs["trust_remote_code"] = bool(trc) + if split is not None: + load_dataset(path, split=split, **kwargs) + else: + load_dataset(path, **kwargs) diff --git a/oellm/task_groups.py b/oellm/task_groups.py new file mode 100644 index 0000000..1419de7 --- /dev/null +++ b/oellm/task_groups.py @@ -0,0 +1,131 @@ +from collections.abc import Iterable +from dataclasses import dataclass +from importlib.resources import files + +import yaml + + +@dataclass +class _Task: + name: str + n_shots: list[int] | None = None + + +@dataclass +class TaskGroup: + name: str + tasks: list[_Task] + suite: str + description: str + n_shots: list[int] | None = None + + def __post_init__(self): + for task in self.tasks: + if task.n_shots is None and self.n_shots is not None: + task.n_shots = self.n_shots + elif task.n_shots is None and self.n_shots is None: + raise ValueError( + f"N_shots is not set for task {task.name} and no default n_shots is set for the task group: {self.name}" + ) + + @classmethod + def from_dict(cls, name: str, data: dict) -> "TaskGroup": + tasks = [] + for task_data in data["tasks"]: + task_name = task_data["task"] + task_n_shots = task_data.get("n_shots") + tasks.append(_Task(name=task_name, n_shots=task_n_shots)) + + return cls( + name=name, + tasks=tasks, + suite=data["suite"], + description=data["description"], + n_shots=data.get("n_shots"), + ) + + +@dataclass +class TaskSuperGroup: + name: str + task_groups: list[TaskGroup] + description: str + + def __post_init__(self): + resolved_groups = [] + for group in self.task_groups: + if isinstance(group, str): + raise ValueError( + f"Task group '{group}' not found in available task groups" + ) + resolved_groups.append(group) + self.task_groups = resolved_groups + + @classmethod + def from_dict( + cls, name: str, data: dict, available_task_groups: dict[str, TaskGroup] + ) -> "TaskSuperGroup": + task_groups = [] + for task_group_data in data["task_groups"]: + group_name = task_group_data["task"] + if group_name not in available_task_groups: + raise ValueError( + f"Task group '{group_name}' not found in available task groups" + ) + task_groups.append(available_task_groups[group_name]) + + return cls( + name=name, + task_groups=task_groups, + description=data["description"], + ) + + +def _parse_task_groups( + requested_groups: list[str], 
+) -> dict[str, TaskSuperGroup | TaskGroup]: + data = ( + yaml.safe_load((files("oellm.resources") / "task-groups.yaml").read_text()) or {} + ) + + task_groups: dict[str, TaskGroup] = {} + + for task_group_name, task_data in data["task_groups"].items(): + task_groups[task_group_name] = TaskGroup.from_dict(task_group_name, task_data) + + super_groups: dict[str, TaskSuperGroup] = {} + for super_group_name, super_group_data in data.get("super_groups", {}).items(): + super_groups[super_group_name] = TaskSuperGroup.from_dict( + super_group_name, super_group_data, task_groups + ) + + result = {**task_groups, **super_groups} + return { + group_name: group + for group_name, group in result.items() + if group_name in requested_groups + } + + +def _expand_task_groups(group_names: Iterable[str]) -> list[tuple[str, list[int], str]]: + parsed = _parse_task_groups([str(n).strip() for n in group_names if str(n).strip()]) + missing = {str(n).strip() for n in group_names if str(n).strip()} - set(parsed.keys()) + if missing: + raise ValueError(f"Unknown task group(s): {', '.join(sorted(missing))}") + + results: list[tuple[str, list[int], str]] = [] + + for _, group in parsed.items(): + if isinstance(group, TaskGroup): + suite = group.suite + for t in group.tasks: + shots = [int(s) for s in (t.n_shots or [])] + results.append((t.name, shots, suite)) + else: + for g in group.task_groups: + suite = g.suite + for t in g.tasks: + shots = [int(s) for s in (t.n_shots or [])] + results.append((t.name, shots, suite)) + + return results diff --git a/oellm/utils.py b/oellm/utils.py new file mode 100644 index 0000000..63927a5 --- /dev/null +++ b/oellm/utils.py @@ -0,0 +1,480 @@ +import builtins +import fnmatch +import logging +import os +import socket +import subprocess +import sys +from collections.abc import Iterable +from contextlib import contextmanager +from functools import wraps +from importlib.resources import files +from pathlib import Path + +import yaml +from rich.console import Console +from rich.logging import RichHandler + +from oellm.task_cache import ( + capture_hf_dataset_calls, + dedupe_calls, + prewarm_from_payload, + task_cache_get_payload, + task_cache_lookup, + task_cache_mark_resolved, + task_cache_set_payload, +) + + +def _ensure_singularity_image(image_name: str) -> None: + from huggingface_hub import hf_hub_download + + image_path = Path(os.getenv("EVAL_BASE_DIR")) / image_name + + try: + hf_hub_download( + repo_id="openeurollm/evaluation_singularity_images", + filename=image_name, + repo_type="dataset", + local_dir=os.getenv("EVAL_BASE_DIR"), + ) + logging.info("Successfully downloaded latest Singularity image from HuggingFace") + except Exception as e: + logging.warning( + "Failed to fetch latest container image from HuggingFace: %s", str(e) + ) + if image_path.exists(): + logging.info("Using existing Singularity image at %s", image_path) + else: + raise RuntimeError( + f"No container image found at {image_path} and failed to download from HuggingFace. " + f"Cannot proceed with evaluation scheduling." 
+ ) from e + + logging.info( + "Singularity image ready at %s", + Path(os.getenv("EVAL_BASE_DIR")) / os.getenv("EVAL_CONTAINER_IMAGE"), + ) + + +def _setup_logging(verbose: bool = False): + rich_handler = RichHandler( + console=Console(), + show_time=True, + log_time_format="%H:%M:%S", + show_path=False, + markup=True, + rich_tracebacks=True, + ) + + class RichFormatter(logging.Formatter): + def format(self, record): + record.msg = f"{record.getMessage()}" + return record.msg + + rich_handler.setFormatter(RichFormatter()) + + root_logger = logging.getLogger() + root_logger.handlers = [] + root_logger.addHandler(rich_handler) + root_logger.setLevel(logging.DEBUG if verbose else logging.INFO) + + +def _load_cluster_env() -> None: + """ + Loads the correct cluster environment variables from `clusters.yaml` based on the hostname. + """ + clusters = yaml.safe_load((files("oellm.resources") / "clusters.yaml").read_text()) + hostname = socket.gethostname() + + shared_cfg = clusters.get("shared", {}) or {} + + cluster_cfg_raw: dict | None = None + for name, cfg in clusters.items(): + if name == "shared": + continue + pattern = cfg.get("hostname_pattern") + if isinstance(pattern, str) and fnmatch.fnmatch(hostname, pattern): + cluster_cfg_raw = dict(cfg) + break + if cluster_cfg_raw is None: + raise ValueError(f"No cluster found for hostname: {hostname}") + + cluster_cfg_raw.pop("hostname_pattern", None) + + class _Default(dict): + def __missing__(self, key): + return "{" + key + "}" + + base_ctx = _Default({**os.environ, **{k: str(v) for k, v in cluster_cfg_raw.items()}}) + + resolved_shared = {k: str(v).format_map(base_ctx) for k, v in shared_cfg.items()} + + ctx = _Default({**base_ctx, **resolved_shared}) + + resolved_cluster = {k: str(v).format_map(ctx) for k, v in cluster_cfg_raw.items()} + + final_env = {**resolved_shared, **resolved_cluster} + for k, v in final_env.items(): + os.environ[k] = v + + +def _num_jobs_in_queue() -> int: + user = os.environ.get("USER") + cmd: list[str] = ["squeue"] + if user: + cmd += ["-u", user] + cmd += ["-h", "-t", "pending,running", "-r", "-o", "%i"] + + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode != 0: + if result.stderr: + logging.warning(f"squeue error: {result.stderr.strip()}") + return 0 + + output = result.stdout.strip() + if not output: + return 0 + return sum(1 for line in output.splitlines() if line.strip()) + + +def _expand_local_model_paths(model: str) -> list[Path]: + """ + Expands a local model path to include all checkpoints if it's a directory. + Recursively searches for models in subdirectories. 
+ + Args: + model: Path to a model or directory containing models + + Returns: + List of paths to model directories containing safetensors files + """ + model_paths = [] + model_path = Path(model) + + if not model_path.exists() or not model_path.is_dir(): + return model_paths + + if any(model_path.glob("*.safetensors")): + model_paths.append(model_path) + return model_paths + + hf_path = model_path / "hf" + if hf_path.exists() and hf_path.is_dir(): + for subdir in hf_path.glob("*"): + if subdir.is_dir() and any(subdir.glob("*.safetensors")): + model_paths.append(subdir) + if model_paths: + return model_paths + + subdirs = [d for d in model_path.iterdir() if d.is_dir()] + + for subdir in subdirs: + if any(subdir.glob("*.safetensors")): + model_paths.append(subdir) + else: + hf_subpath = subdir / "hf" + if hf_subpath.exists() and hf_subpath.is_dir(): + for checkpoint_dir in hf_subpath.glob("*"): + if checkpoint_dir.is_dir() and any( + checkpoint_dir.glob("*.safetensors") + ): + model_paths.append(checkpoint_dir) + + if len(model_paths) > 1: + logging.info(f"Expanded '{model}' to {len(model_paths)} model checkpoints") + + return model_paths + + +def _process_model_paths(models: Iterable[str]) -> dict[str, list[Path | str]]: + """ + Processes model strings into a dict of model paths. + + Each model string can be a local path or a huggingface model identifier. + This function expands directory paths that contain multiple checkpoints. + """ + from huggingface_hub import snapshot_download + + processed_model_paths: dict[str, list[Path | str]] = {} + + for model in models: + per_model_paths: list[Path | str] = [] + + local_paths = _expand_local_model_paths(model) + if local_paths: + per_model_paths.extend(local_paths) + else: + logging.info( + f"Model {model} not found locally, assuming it is a πŸ€— hub model" + ) + logging.debug( + f"Downloading model {model} on the login node since the compute nodes may not have access to the internet" + ) + + if "," in model: + model_kwargs = dict( + [kv.split("=") for kv in model.split(",") if "=" in kv] + ) + + repo_id = model.split(",")[0] + + snapshot_kwargs = {} + if "revision" in model_kwargs: + snapshot_kwargs["revision"] = model_kwargs["revision"] + + try: + snapshot_download( + repo_id=repo_id, + cache_dir=Path(os.getenv("HF_HOME")) / "hub", + **snapshot_kwargs, + ) + per_model_paths.append(model) + except Exception as e: + logging.debug( + f"Failed to download model {model} from Hugging Face Hub. Continuing..." + ) + logging.debug(e) + else: + snapshot_download( + repo_id=model, + cache_dir=Path(os.getenv("HF_HOME")) / "hub", + ) + per_model_paths.append(model) + + if not per_model_paths: + logging.warning( + f"Could not find any valid model for '{model}'. It will be skipped." + ) + processed_model_paths[model] = per_model_paths + + return processed_model_paths + + +def _pre_download_task_datasets( + tasks: Iterable[str], trust_remote_code: bool = True +) -> None: + processed: set[str] = set() + + misses: list[str] = [] + for task_name in tasks: + if not isinstance(task_name, str) or task_name in processed: + continue + processed.add(task_name) + if task_cache_lookup("lm-eval", task_name): + logging.info( + f"Skipping dataset preparation for task '{task_name}' (cache hit within TTL)." 
+ ) + continue + misses.append(task_name) + + if not misses: + for task_name in processed: + if task_cache_lookup("lm-eval", task_name): + prewarm_from_payload( + task_cache_get_payload("lm-eval", task_name), + trust_remote_code=trust_remote_code, + ) + return + + from datasets import DownloadMode # type: ignore + from lm_eval.tasks import TaskManager # type: ignore + + tm = TaskManager() + + for task_name in misses: + logging.info( + f"Preparing dataset for task '{task_name}' (download if not cached)…" + ) + + task_config = { + "task": task_name, + "dataset_kwargs": {"trust_remote_code": trust_remote_code}, + } + + with capture_hf_dataset_calls() as captured_calls: + task_objects = tm.load_config(task_config) + + stack = [task_objects] + while stack: + current = stack.pop() + if isinstance(current, dict): + stack.extend(current.values()) + continue + if hasattr(current, "download") and callable(current.download): + try: + current.download( + download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS + ) # type: ignore[arg-type] + except TypeError as e: + logging.error( + f"Failed to download dataset for task '{task_name}' with download_mode=REUSE_DATASET_IF_EXISTS: {e}" + ) + current.download() # type: ignore[misc] + + if captured_calls: + payload = {"calls": dedupe_calls(captured_calls)} + task_cache_set_payload("lm-eval", task_name, payload) + task_cache_mark_resolved("lm-eval", task_name) + logging.debug(f"Finished dataset preparation for task '{task_name}'.") + + +def _pre_download_lighteval_datasets(tasks: Iterable[str]) -> None: + misses: list[str] = [] + processed: set[str] = set() + for t in tasks: + raw = str(t).strip() + if not raw or raw in processed: + continue + processed.add(raw) + if task_cache_lookup("lighteval", raw): + logging.info( + f"Skipping dataset preparation for LightEval task '{raw}' (cache hit within TTL)." 
+ ) + continue + misses.append(raw) + + if not misses: + for raw in processed: + if task_cache_lookup("lighteval", raw): + prewarm_from_payload( + task_cache_get_payload("lighteval", raw), + trust_remote_code=True, + ) + return + + from lighteval.tasks.lighteval_task import LightevalTask # type: ignore + from lighteval.tasks.registry import ( # type: ignore + TRUNCATE_FEW_SHOTS_DEFAULTS, + Registry, + ) + + for raw in misses: + candidate = Path(raw) + if candidate.exists() and candidate.is_file(): + with capture_hf_dataset_calls() as captured_calls: + reg_file = Registry() + configs_file = reg_file.get_tasks_configs(str(candidate)) + task_dict_file = reg_file.get_tasks_from_configs(configs_file) + LightevalTask.load_datasets(task_dict_file) + if captured_calls: + payload = {"calls": dedupe_calls(captured_calls)} + task_cache_set_payload("lighteval", raw, payload) + task_cache_mark_resolved("lighteval", raw) + continue + + # Build single-spec string and load in isolation + spec = raw + truncate_default = int(TRUNCATE_FEW_SHOTS_DEFAULTS) + if "|" not in spec: + spec = f"lighteval|{spec}|0|{truncate_default}" + elif spec.count("|") == 1: + spec = f"{spec}|0|{truncate_default}" + elif spec.count("|") == 2: + spec = f"{spec}|{truncate_default}" + + with capture_hf_dataset_calls() as captured_calls: + reg = Registry(custom_tasks="lighteval.tasks.multilingual.tasks") + configs = reg.get_tasks_configs(spec) + task_dict = reg.get_tasks_from_configs(configs) + LightevalTask.load_datasets(task_dict) + if captured_calls: + payload = {"calls": dedupe_calls(captured_calls)} + task_cache_set_payload("lighteval", raw, payload) + task_cache_mark_resolved("lighteval", raw) + + +@contextmanager +def capture_third_party_output(verbose: bool = False): + """ + Suppresses print/logging.info/logging.debug originating from non-project modules + unless verbose=True. + + A call is considered "third-party" if its immediate caller's file path is not + under the repository root (parent of the `oellm` package directory). 
+ """ + if verbose: + yield + return + + package_root = Path(__file__).resolve().parent + + def is_internal_stack(skip: int = 2, max_depth: int = 12) -> bool: + f = sys._getframe(skip) + depth = 0 + while f and depth < max_depth: + filename = f.f_code.co_filename if f.f_code else "" + if filename: + p = Path(filename).resolve() + if p.is_relative_to(package_root): + return True + f = f.f_back + depth += 1 + return False + + orig_print = builtins.print + orig_logger_info = logging.Logger.info + orig_logger_debug = logging.Logger.debug + orig_module_info = logging.info + orig_module_debug = logging.debug + + def filtered_print(*args, **kwargs): + if is_internal_stack(): + return orig_print(*args, **kwargs) + # third-party: drop + return None + + def filtered_logger_info(self, msg, *args, **kwargs): + if is_internal_stack(): + return orig_logger_info(self, msg, *args, **kwargs) + return None + + def filtered_logger_debug(self, msg, *args, **kwargs): + if is_internal_stack(): + return orig_logger_debug(self, msg, *args, **kwargs) + return None + + def filtered_module_info(msg, *args, **kwargs): + if is_internal_stack(): + return orig_module_info(msg, *args, **kwargs) + return None + + def filtered_module_debug(msg, *args, **kwargs): + if is_internal_stack(): + return orig_module_debug(msg, *args, **kwargs) + return None + + builtins.print = filtered_print + logging.Logger.info = filtered_logger_info # type: ignore[assignment] + logging.Logger.debug = filtered_logger_debug # type: ignore[assignment] + logging.info = filtered_module_info # type: ignore[assignment] + logging.debug = filtered_module_debug # type: ignore[assignment] + + try: + yield + finally: + builtins.print = orig_print + logging.Logger.info = orig_logger_info # type: ignore[assignment] + logging.Logger.debug = orig_logger_debug # type: ignore[assignment] + logging.info = orig_module_info # type: ignore[assignment] + logging.debug = orig_module_debug # type: ignore[assignment] + + +def capture_third_party_output_from_kwarg( + verbose_kwarg: str = "verbose", default: bool = False +): + """ + Decorator factory that wraps the function execution inside + capture_third_party_output(verbose=kwargs.get(verbose_kwarg, default)). + """ + + def _decorator(func): + @wraps(func) + def _wrapper(*args, **kwargs): + verbose_value = bool(kwargs.get(verbose_kwarg, default)) + with capture_third_party_output(verbose=verbose_value): + return func(*args, **kwargs) + + return _wrapper + + return _decorator diff --git a/pyproject.toml b/pyproject.toml index 2b7a64a..d699cba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,12 @@ dependencies = [ "questionary", ] +[project.optional-dependencies] +dev = [ + "pytest>=8.4.1", + "pre-commit", +] + [project.scripts] oellm = "oellm.main:main" @@ -28,7 +34,7 @@ build-backend = "uv_build" [tool.uv.build-backend] module-name = "oellm" module-root = "" -include = ["oellm/clusters.yaml", "oellm/task-groups.yaml"] +include = ["oellm/resources/*"] [tool.uv.sources] torch = [ @@ -72,8 +78,3 @@ quote-style = "double" indent-style = "space" skip-magic-trailing-comma = false line-ending = "auto" - -[dependency-groups] -dev = [ - "pytest>=8.4.1", -] From a97d92dde63dbab52d0e43f4209a3406ad92df2c Mon Sep 17 00:00:00 2001 From: "Timur M. 
Carstensen" <40788422+timurcarstensen@users.noreply.github.com> Date: Tue, 21 Oct 2025 21:33:19 +0200 Subject: [PATCH 15/39] Update README.md Co-authored-by: David Salinas --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cdc3f89..328c1aa 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ This will launch an interactive workflow where you can: - Configure n-shot settings - Preview and save your evaluation configuration -The resulting CSV now includes an additional `eval_suite` column that records which +The resulting CSV includes an additional `eval_suite` column that records which evaluation framework (e.g., `lm_eval` or `lighteval`) should be used for each task. From c9db766e60f09e548a1eb8530fc8325765a93caf Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Tue, 21 Oct 2025 22:35:40 +0300 Subject: [PATCH 16/39] misc --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index cdc3f89..e00b278 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ # OpenEuroLLM CLI (oellm) -A package for running OELLM CLI workflows across multiple HPC clusters using SLURM job arrays and Singularity containers. +A package for running OELLM CLI workflows across multiple HPC clusters using SLURM job arrays and Singularity containers. ## Currently supported workflows - Schedule evaluations on multiple models and tasks on all clusters βœ… `oellm schedule-eval ...` - Restart failed evaluations (e.g., due to node failures) βœ… `oellm collect-results ... --reschedule true` - Interactive eval job/csv builder βœ… `oellm build-csv` - Recursively resolve local paths: pass a directory containing models and their nested intermediate checkpoints, will eval all checkpoints - - Support default task groups (cf `oellm/task-groups.yaml`) + - Support default task groups (cf `oellm/resources/task-groups.yaml`) ## Planned workflows - Sync and download evaluation results from all clusters via a shared data layer @@ -36,7 +36,7 @@ This will automatically: - Generate a SLURM job array to evaluate all model-task combinations - Submit the jobs with appropriate cluster-specific resource allocations -In case you meet HuggingFace quotas issues, make sure you are logged in by setting your `HF_TOKEN` and that you are part of [OpenEuroLLM](https://huggingface.co/OpenEuroLLM) organization. +In case you meet HuggingFace quotas issues, make sure you are logged in by setting your `HF_TOKEN` and that you are part of [OpenEuroLLM](https://huggingface.co/OpenEuroLLM) organization. ## Interactive CSV Builder @@ -108,7 +108,7 @@ The `oellm` package orchestrates distributed LLM evaluations through the followi ### 1. 
**Cluster Auto-Detection** - Automatically detects the current HPC cluster based on hostname patterns -- Loads cluster-specific configurations from [`clusters.yaml`](oellm/clusters.yaml) including: +- Loads cluster-specific configurations from [`clusters.yaml`](oellm/resources/clusters.yaml) including: - SLURM partition and account settings - Shared storage paths for models, datasets, and results - GPU allocation and queue limits From 10b26ff0ebeae019f33f6423d02b4a352669082d Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Tue, 21 Oct 2025 22:37:11 +0300 Subject: [PATCH 17/39] temporarily adding AGENTS>md for development --- AGENTS.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 AGENTS.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..7e64337 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,5 @@ +Rules: +- no try...Except unless absolutely necessary +- no unnecessary comments +- don't worry about tests +- if you need to run stuff, assume there is a .venv at the root of the project. you can also just use uv From e8e3b38a96ab5f65dac3543fa52510562a2f03af Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Tue, 21 Oct 2025 23:43:28 +0200 Subject: [PATCH 18/39] fix: task caching for lighteval --- oellm/resources/clusters.yaml | 1 + oellm/resources/task-groups.yaml | 4 +- oellm/task_cache.py | 71 +++++++++++++++---------- oellm/utils.py | 91 +++++++++++++++----------------- 4 files changed, 89 insertions(+), 78 deletions(-) diff --git a/oellm/resources/clusters.yaml b/oellm/resources/clusters.yaml index d6da6d2..738c25a 100644 --- a/oellm/resources/clusters.yaml +++ b/oellm/resources/clusters.yaml @@ -6,6 +6,7 @@ shared: EVAL_OUTPUT_DIR: "{EVAL_BASE_DIR}/{USER}" # where evaluations are written GPUS_PER_NODE: 1 HF_HUB_DISABLE_PROGRESS_BARS: "1" + HF_DATASETS_DISABLE_PROGRESS_BARS: "1" leonardo: hostname_pattern: "*.leonardo.local" # use this regexp to automatically assign environment variables corresponding to this YAML diff --git a/oellm/resources/task-groups.yaml b/oellm/resources/task-groups.yaml index f9b7684..ee081f2 100644 --- a/oellm/resources/task-groups.yaml +++ b/oellm/resources/task-groups.yaml @@ -140,6 +140,6 @@ super_groups: description: "Combined Belebele EU set plus multilingual benchmarks" task_groups: - task: flores-200-eu-to-eng - # - task: flores-200-eng-to-eu + - task: flores-200-eng-to-eu - task: belebele-eu-5-shot - # - task: global-mmlu-eu + - task: global-mmlu-eu diff --git a/oellm/task_cache.py b/oellm/task_cache.py index d8be806..2fdeea0 100644 --- a/oellm/task_cache.py +++ b/oellm/task_cache.py @@ -1,12 +1,18 @@ import json import logging from contextlib import contextmanager +from contextvars import ContextVar from datetime import datetime from pathlib import Path TASK_CACHE_TTL_DAYS = 30 +_CURRENT_CAPTURE_BUFFER: ContextVar[list[dict] | None] = ContextVar( + "_CURRENT_CAPTURE_BUFFER", default=None +) + + def get_task_cache_file() -> Path: return Path(__file__).resolve().parent / "resources" / "task_map_cache.json" @@ -147,6 +153,7 @@ def dedupe_calls(calls: list[dict]) -> list[dict]: @contextmanager def capture_hf_dataset_calls(): captured: list[dict] = [] + _buffer_token = _CURRENT_CAPTURE_BUFFER.set(captured) import datasets as _ds # type: ignore import huggingface_hub as _hfh # type: ignore @@ -173,17 +180,19 @@ def _load_dataset_proxy(path, *args, **kwargs): # noqa: ANN001 ) trust_remote_code = kwargs.get("trust_remote_code") revision = kwargs.get("revision") - captured.append( - { - "type": "load_dataset", - "path": path, - 
"name": name, - "data_files": data_files, - "split": split, - "revision": revision, - "trust_remote_code": trust_remote_code, - } - ) + buf = _CURRENT_CAPTURE_BUFFER.get() + if isinstance(buf, list): + buf.append( + { + "type": "load_dataset", + "path": path, + "name": name, + "data_files": data_files, + "split": split, + "revision": revision, + "trust_remote_code": trust_remote_code, + } + ) return _orig_load_dataset(path, *args, **kwargs) def _snapshot_download_proxy(*args, **kwargs): # noqa: ANN001 @@ -202,14 +211,16 @@ def _snapshot_download_proxy(*args, **kwargs): # noqa: ANN001 if "revision" in kwargs else (args[2] if len(args) > 2 else None) ) - captured.append( - { - "type": "snapshot_download", - "repo_id": repo_id, - "repo_type": repo_type, - "revision": revision, - } - ) + buf = _CURRENT_CAPTURE_BUFFER.get() + if isinstance(buf, list): + buf.append( + { + "type": "snapshot_download", + "repo_id": repo_id, + "repo_type": repo_type, + "revision": revision, + } + ) return _orig_snapshot_download(*args, **kwargs) def _hf_hub_download_proxy(*args, **kwargs): # noqa: ANN001 @@ -233,26 +244,30 @@ def _hf_hub_download_proxy(*args, **kwargs): # noqa: ANN001 if "revision" in kwargs else (args[3] if len(args) > 3 else None) ) - captured.append( - { - "type": "hf_hub_download", - "repo_id": repo_id, - "filename": filename, - "repo_type": repo_type, - "revision": revision, - } - ) + buf = _CURRENT_CAPTURE_BUFFER.get() + if isinstance(buf, list): + buf.append( + { + "type": "hf_hub_download", + "repo_id": repo_id, + "filename": filename, + "repo_type": repo_type, + "revision": revision, + } + ) return _orig_hf_hub_download(*args, **kwargs) _ds.load_dataset = _load_dataset_proxy # type: ignore[assignment] _hfh.snapshot_download = _snapshot_download_proxy # type: ignore[assignment] _hfh.hf_hub_download = _hf_hub_download_proxy # type: ignore[assignment] + try: yield captured finally: _ds.load_dataset = _orig_load_dataset # type: ignore[assignment] _hfh.snapshot_download = _orig_snapshot_download # type: ignore[assignment] _hfh.hf_hub_download = _orig_hf_hub_download # type: ignore[assignment] + _CURRENT_CAPTURE_BUFFER.reset(_buffer_token) def prewarm_from_payload(payload: dict | None, *, trust_remote_code: bool = True) -> None: diff --git a/oellm/utils.py b/oellm/utils.py index 63927a5..dac7a49 100644 --- a/oellm/utils.py +++ b/oellm/utils.py @@ -320,68 +320,57 @@ def _pre_download_task_datasets( def _pre_download_lighteval_datasets(tasks: Iterable[str]) -> None: + seen: set[str] = set() misses: list[str] = [] - processed: set[str] = set() - for t in tasks: - raw = str(t).strip() - if not raw or raw in processed: + tasks = [str(task).strip() for task in tasks] + for task in tasks: + if not task or task in seen: continue - processed.add(raw) - if task_cache_lookup("lighteval", raw): + seen.add(task) + if task_cache_lookup("lighteval", task): logging.info( - f"Skipping dataset preparation for LightEval task '{raw}' (cache hit within TTL)." + f"Skipping dataset preparation for task '{task}' (cache hit within TTL)." 
) continue - misses.append(raw) + misses.append(task) if not misses: - for raw in processed: - if task_cache_lookup("lighteval", raw): + for task in seen: + if task_cache_lookup("lighteval", task): prewarm_from_payload( - task_cache_get_payload("lighteval", raw), + task_cache_get_payload("lighteval", task), trust_remote_code=True, ) return - from lighteval.tasks.lighteval_task import LightevalTask # type: ignore - from lighteval.tasks.registry import ( # type: ignore - TRUNCATE_FEW_SHOTS_DEFAULTS, - Registry, - ) + for task in misses: + with capture_hf_dataset_calls() as captured_calls: + from lighteval.tasks.lighteval_task import LightevalTask + from lighteval.tasks.registry import ( + TRUNCATE_FEW_SHOTS_DEFAULTS, + Registry, + ) - for raw in misses: - candidate = Path(raw) - if candidate.exists() and candidate.is_file(): - with capture_hf_dataset_calls() as captured_calls: - reg_file = Registry() - configs_file = reg_file.get_tasks_configs(str(candidate)) - task_dict_file = reg_file.get_tasks_from_configs(configs_file) - LightevalTask.load_datasets(task_dict_file) - if captured_calls: - payload = {"calls": dedupe_calls(captured_calls)} - task_cache_set_payload("lighteval", raw, payload) - task_cache_mark_resolved("lighteval", raw) - continue + reg = Registry(custom_tasks="lighteval.tasks.multilingual.tasks") + truncate_default = int(TRUNCATE_FEW_SHOTS_DEFAULTS) - # Build single-spec string and load in isolation - spec = raw - truncate_default = int(TRUNCATE_FEW_SHOTS_DEFAULTS) - if "|" not in spec: - spec = f"lighteval|{spec}|0|{truncate_default}" - elif spec.count("|") == 1: - spec = f"{spec}|0|{truncate_default}" - elif spec.count("|") == 2: - spec = f"{spec}|{truncate_default}" + spec = task + if "|" not in spec: + spec = f"lighteval|{spec}|0|{truncate_default}" + elif spec.count("|") == 1: + spec = f"{spec}|0|{truncate_default}" + elif spec.count("|") == 2: + spec = f"{spec}|{truncate_default}" - with capture_hf_dataset_calls() as captured_calls: - reg = Registry(custom_tasks="lighteval.tasks.multilingual.tasks") configs = reg.get_tasks_configs(spec) task_dict = reg.get_tasks_from_configs(configs) LightevalTask.load_datasets(task_dict) - if captured_calls: - payload = {"calls": dedupe_calls(captured_calls)} - task_cache_set_payload("lighteval", raw, payload) - task_cache_mark_resolved("lighteval", raw) + + payload = ( + {"calls": dedupe_calls(captured_calls)} if captured_calls else {"calls": []} + ) + task_cache_set_payload("lighteval", task, payload) + task_cache_mark_resolved("lighteval", task) @contextmanager @@ -399,15 +388,21 @@ def capture_third_party_output(verbose: bool = False): package_root = Path(__file__).resolve().parent - def is_internal_stack(skip: int = 2, max_depth: int = 12) -> bool: + def is_internal_stack(skip: int = 2, max_depth: int = 20) -> bool: f = sys._getframe(skip) depth = 0 while f and depth < max_depth: - filename = f.f_code.co_filename if f.f_code else "" + code = f.f_code + filename = code.co_filename if code else "" if filename: p = Path(filename).resolve() - if p.is_relative_to(package_root): - return True + name = code.co_name if code else "" + # Skip logging internals and our filtering wrappers to find the real caller + if "/logging/__init__.py" in filename or name.startswith("filtered_"): + f = f.f_back + depth += 1 + continue + return p.is_relative_to(package_root) f = f.f_back depth += 1 return False From d8c8ed513ccf4b7a6ba1e92000edd1207544139c Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 00:55:45 +0300 Subject: [PATCH 
19/39] fix --- .github/workflows/build-and-push-apptainer.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-and-push-apptainer.yml b/.github/workflows/build-and-push-apptainer.yml index 197816e..5d3db37 100644 --- a/.github/workflows/build-and-push-apptainer.yml +++ b/.github/workflows/build-and-push-apptainer.yml @@ -18,7 +18,7 @@ jobs: matrix: image: [jureca, leonardo, lumi] name: Build & Publish SIF Artifact (${{ matrix.image }}) - runs-on: + runs-on: - runs-on=${{github.run_id}}/family=i7ie permissions: contents: read @@ -26,7 +26,7 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v4 - + - name: Install Apptainer run: | sudo apt-get update @@ -37,7 +37,7 @@ jobs: - name: Build SIF from definition file run: | - apptainer --verbose build --mksquashfs-args="-comp zstd -Xcompression-level 1" --fakeroot eval_env-${{ matrix.image }}.sif apptainer/${{ matrix.image }}.def + apptainer --verbose build --mksquashfs-args="-comp zstd -Xcompression-level 6" --fakeroot eval_env-${{ matrix.image }}.sif apptainer/${{ matrix.image }}.def - name: Install Hugging Face Hub CLI run: pip install --upgrade "huggingface_hub" @@ -45,7 +45,7 @@ jobs: - name: Login to Hugging Face Hub env: HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: hf auth login --token "$HF_TOKEN" + run: hf auth login --token "$HF_TOKEN" - name: Upload SIF to Hugging Face Hub env: From d37b5327074ad10a359d7ad95bd6982eed45576e Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 11:03:43 +0300 Subject: [PATCH 20/39] fix: compression algorithm --- .github/workflows/build-and-push-apptainer.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-and-push-apptainer.yml b/.github/workflows/build-and-push-apptainer.yml index 5d3db37..0d1c8dd 100644 --- a/.github/workflows/build-and-push-apptainer.yml +++ b/.github/workflows/build-and-push-apptainer.yml @@ -37,7 +37,7 @@ jobs: - name: Build SIF from definition file run: | - apptainer --verbose build --mksquashfs-args="-comp zstd -Xcompression-level 6" --fakeroot eval_env-${{ matrix.image }}.sif apptainer/${{ matrix.image }}.def + apptainer --verbose build --mksquashfs-args="-comp gzip -Xcompression-level 1" --fakeroot eval_env-${{ matrix.image }}.sif apptainer/${{ matrix.image }}.def - name: Install Hugging Face Hub CLI run: pip install --upgrade "huggingface_hub" From 79ace47c7db41a1b51d6f6cfa0a78a66c6a898e5 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 11:22:14 +0200 Subject: [PATCH 21/39] fix: updated apptainer definitions to include correct uv install --- apptainer/jureca.def | 26 ++++++++++++++------------ apptainer/leonardo.def | 25 +++++++++++++------------ 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/apptainer/jureca.def b/apptainer/jureca.def index 23cd237..fe190ba 100644 --- a/apptainer/jureca.def +++ b/apptainer/jureca.def @@ -2,27 +2,29 @@ Bootstrap: docker From: nvcr.io/nvidia/pytorch:25.06-py3 %labels - Author multi-cluster-eval - Description Apptainer image for JURECA cluster (converted from dockerfile) + Author oellm-cli + Description Apptainer image for JURECA JSC cluster %post - # 1. 
Install uv package manager - curl -LsSf https://astral.sh/uv/install.sh | sh - echo 'export PATH=$HOME/.local/bin:$PATH' >> /etc/profile + # Install uv into a global bin + curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR=/usr/local/bin sh - # Make uv visible for subsequent commands during build - export PATH=/root/.local/bin:$PATH + # Put uv-installed tool shims in a global bin too + export UV_TOOL_BIN_DIR=/usr/local/bin + uv --version - # 2. Install Python dependencies uv pip install --system --break-system-packages lm-eval \ "transformers<=4.53.0" "datasets<4.0.0" wandb sentencepiece tiktoken accelerate - # Install LightEval CLI in an isolated environment + # Optional: keep tool envs under /opt to avoid $HOME + export UV_TOOL_DIR=/opt/uv-tools uv tool install "lighteval[multilingual]" - + %environment - # Ensure uv is present inside the container runtime as well - export PATH=/root/.local/bin:$PATH + export PATH=/usr/local/bin:$PATH + export UV_TOOL_BIN_DIR=/usr/local/bin + export UV_TOOL_DIR=/opt/uv-tools + %runscript exec bash "$@" \ No newline at end of file diff --git a/apptainer/leonardo.def b/apptainer/leonardo.def index 27f0eca..79f69f9 100644 --- a/apptainer/leonardo.def +++ b/apptainer/leonardo.def @@ -2,27 +2,28 @@ Bootstrap: docker From: nvcr.io/nvidia/pytorch:25.06-py3 %labels - Author multi-cluster-eval - Description Apptainer image for Leonardo cluster (converted from dockerfile) + Author oellm-cli + Description Apptainer image for Leonardo cluster %post - # 1. Install uv package manager - curl -LsSf https://astral.sh/uv/install.sh | sh - echo 'export PATH=$HOME/.local/bin:$PATH' >> /etc/profile + # Install uv into a global bin + curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR=/usr/local/bin sh - # Make uv visible for subsequent commands during build - export PATH=/root/.local/bin:$PATH + # Put uv-installed tool shims in a global bin too + export UV_TOOL_BIN_DIR=/usr/local/bin + uv --version - # 2. Install Python dependencies uv pip install --system --break-system-packages lm-eval \ "transformers<=4.53.0" "datasets<4.0.0" wandb sentencepiece tiktoken accelerate - # Install LightEval CLI in an isolated environment + # Optional: keep tool envs under /opt to avoid $HOME + export UV_TOOL_DIR=/opt/uv-tools uv tool install "lighteval[multilingual]" - + %environment - # Ensure uv is present inside the container runtime as well - export PATH=/root/.local/bin:$PATH + export PATH=/usr/local/bin:$PATH + export UV_TOOL_BIN_DIR=/usr/local/bin + export UV_TOOL_DIR=/opt/uv-tools %runscript exec bash "$@" \ No newline at end of file From 13e985c51b3aec385b2acf6dee6872606daf5a45 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 11:35:30 +0200 Subject: [PATCH 22/39] fix: lighteval cli args --- oellm/resources/template.sbatch | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/oellm/resources/template.sbatch b/oellm/resources/template.sbatch index b68d637..d02a93c 100644 --- a/oellm/resources/template.sbatch +++ b/oellm/resources/template.sbatch @@ -135,8 +135,8 @@ do lighteval accelerate \ "model_name=$model_path,trust_remote_code=True" \ "$LIGHT_TASK_ARG" \ - --output_dir "$RESULTS_SUBDIR" \ - --save_details + --output-dir "$RESULTS_SUBDIR" \ + --save-details ;; *) echo "[warning] Unknown evaluation suite '$eval_suite'. Skipping." 
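For reference, the task-cache fix in PATCH 18 above routes every proxied Hugging Face call through a `ContextVar` so that each `capture_hf_dataset_calls()` invocation only records calls made while its own buffer is active, even if captures nest. A condensed, self-contained sketch of that pattern (names shortened for illustration; the real proxies in `oellm/task_cache.py` also wrap `snapshot_download` and `hf_hub_download`):

```python
# Hedged sketch of the ContextVar-backed capture buffer from PATCH 18.
from contextlib import contextmanager
from contextvars import ContextVar

_BUFFER: ContextVar[list[dict] | None] = ContextVar("_BUFFER", default=None)

def _record(call: dict) -> None:
    # Proxies append to whichever buffer is active in the current context.
    buf = _BUFFER.get()
    if isinstance(buf, list):
        buf.append(call)

@contextmanager
def capture():
    captured: list[dict] = []
    token = _BUFFER.set(captured)
    try:
        yield captured
    finally:
        _BUFFER.reset(token)

with capture() as calls:
    _record({"type": "load_dataset", "path": "demo"})
assert calls == [{"type": "load_dataset", "path": "demo"}]
```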
From c9160d5eb5700fa9034609941f634f8bbbdeb04e Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 12:53:24 +0200 Subject: [PATCH 23/39] feat: wrapper to suppress tqdm output --- oellm/resources/clusters.yaml | 4 +- oellm/utils.py | 73 +++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 2 deletions(-) diff --git a/oellm/resources/clusters.yaml b/oellm/resources/clusters.yaml index 738c25a..a963dad 100644 --- a/oellm/resources/clusters.yaml +++ b/oellm/resources/clusters.yaml @@ -5,8 +5,8 @@ shared: HF_HOME: "{EVAL_BASE_DIR}/hf_data" # where HuggingFace models and datasets are stored EVAL_OUTPUT_DIR: "{EVAL_BASE_DIR}/{USER}" # where evaluations are written GPUS_PER_NODE: 1 - HF_HUB_DISABLE_PROGRESS_BARS: "1" - HF_DATASETS_DISABLE_PROGRESS_BARS: "1" + HF_HUB_DISABLE_PROGRESS_BARS: "0" + HF_DATASETS_DISABLE_PROGRESS_BARS: "0" leonardo: hostname_pattern: "*.leonardo.local" # use this regexp to automatically assign environment variables corresponding to this YAML diff --git a/oellm/utils.py b/oellm/utils.py index dac7a49..90d0660 100644 --- a/oellm/utils.py +++ b/oellm/utils.py @@ -26,11 +26,81 @@ ) +@contextmanager +def suppress_tqdm_rendering(enabled: bool = True): + """ + Temporarily suppresses tqdm progress bar rendering when enabled=True. + + This prevents any visual rendering by overriding the class methods + responsible for output, without altering other behavior. + """ + if not enabled: + yield + return + + import tqdm as _tqdm + from tqdm import auto as _tqdm_auto + + classes = [_tqdm.tqdm, _tqdm_auto.tqdm] + seen: set[int] = set() + patched: list[tuple[object, str, object]] = [] + + for cls in classes: + cid = id(cls) + if cid in seen: + continue + seen.add(cid) + + if hasattr(cls, "display"): + orig_display = cls.display # type: ignore[attr-defined] + + def _noop_display(self, *args, **kwargs): + return None + + cls.display = _noop_display # type: ignore[assignment] + patched.append((cls, "display", orig_display)) + + if hasattr(cls, "refresh"): + orig_refresh = cls.refresh # type: ignore[attr-defined] + + def _noop_refresh(self, *args, **kwargs): + return None + + cls.refresh = _noop_refresh # type: ignore[assignment] + patched.append((cls, "refresh", orig_refresh)) + + try: + yield + finally: + for cls, name, orig in patched: + setattr(cls, name, orig) + + +def filter_tqdm(enabled: bool = True): + """ + Decorator factory to suppress tqdm rendering for the wrapped function + when enabled=True. + """ + + def _decorator(func): + @wraps(func) + def _wrapper(*args, **kwargs): + with suppress_tqdm_rendering(enabled=enabled): + return func(*args, **kwargs) + + return _wrapper + + return _decorator + + +@filter_tqdm(enabled=False) def _ensure_singularity_image(image_name: str) -> None: from huggingface_hub import hf_hub_download image_path = Path(os.getenv("EVAL_BASE_DIR")) / image_name + logging.info(f"Downloading latest Singularity image from HuggingFace: {image_name}") + try: hf_hub_download( repo_id="openeurollm/evaluation_singularity_images", @@ -187,6 +257,7 @@ def _expand_local_model_paths(model: str) -> list[Path]: return model_paths +@filter_tqdm(enabled=True) def _process_model_paths(models: Iterable[str]) -> dict[str, list[Path | str]]: """ Processes model strings into a dict of model paths. 
@@ -251,6 +322,7 @@ def _process_model_paths(models: Iterable[str]) -> dict[str, list[Path | str]]: return processed_model_paths +@filter_tqdm(enabled=True) def _pre_download_task_datasets( tasks: Iterable[str], trust_remote_code: bool = True ) -> None: @@ -319,6 +391,7 @@ def _pre_download_task_datasets( logging.debug(f"Finished dataset preparation for task '{task_name}'.") +@filter_tqdm(enabled=True) def _pre_download_lighteval_datasets(tasks: Iterable[str]) -> None: seen: set[str] = set() misses: list[str] = [] From ccf4c5a322800529fb6096e8abf0e31f56f977b9 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 13:18:42 +0200 Subject: [PATCH 24/39] misc --- apptainer/jureca.def | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apptainer/jureca.def b/apptainer/jureca.def index fe190ba..400c42b 100644 --- a/apptainer/jureca.def +++ b/apptainer/jureca.def @@ -18,7 +18,7 @@ From: nvcr.io/nvidia/pytorch:25.06-py3 # Optional: keep tool envs under /opt to avoid $HOME export UV_TOOL_DIR=/opt/uv-tools - uv tool install "lighteval[multilingual]" + uv tool install "lighteval[multilingual] @ git+https://github.com/huggingface/lighteval.git@63424f4e795ecc577b90646381b374af3a627978" %environment export PATH=/usr/local/bin:$PATH From 97b3d6953c6c65753ab3300deb1aaf8d4c876669 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 13:49:54 +0200 Subject: [PATCH 25/39] fix: lighteval tool python version --- apptainer/jureca.def | 2 +- apptainer/leonardo.def | 2 +- apptainer/lumi.def | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apptainer/jureca.def b/apptainer/jureca.def index 400c42b..68be6eb 100644 --- a/apptainer/jureca.def +++ b/apptainer/jureca.def @@ -18,7 +18,7 @@ From: nvcr.io/nvidia/pytorch:25.06-py3 # Optional: keep tool envs under /opt to avoid $HOME export UV_TOOL_DIR=/opt/uv-tools - uv tool install "lighteval[multilingual] @ git+https://github.com/huggingface/lighteval.git@63424f4e795ecc577b90646381b374af3a627978" + uv tool install --python 3.12 "lighteval[multilingual] @ git+https://github.com/huggingface/lighteval.git@63424f4e795ecc577b90646381b374af3a627978" %environment export PATH=/usr/local/bin:$PATH diff --git a/apptainer/leonardo.def b/apptainer/leonardo.def index 79f69f9..14a9576 100644 --- a/apptainer/leonardo.def +++ b/apptainer/leonardo.def @@ -18,7 +18,7 @@ From: nvcr.io/nvidia/pytorch:25.06-py3 # Optional: keep tool envs under /opt to avoid $HOME export UV_TOOL_DIR=/opt/uv-tools - uv tool install "lighteval[multilingual]" + uv tool install --python 3.12 "lighteval[multilingual] @ git+https://github.com/huggingface/lighteval.git@63424f4e795ecc577b90646381b374af3a627978" %environment export PATH=/usr/local/bin:$PATH diff --git a/apptainer/lumi.def b/apptainer/lumi.def index a7d71d7..2f7e8c4 100644 --- a/apptainer/lumi.def +++ b/apptainer/lumi.def @@ -18,7 +18,7 @@ From: rocm/pytorch:rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.7.1 # Optional: keep tool envs under /opt to avoid $HOME export UV_TOOL_DIR=/opt/uv-tools - uv tool install "lighteval[multilingual]" + uv tool install --python 3.12 "lighteval[multilingual] @ git+https://github.com/huggingface/lighteval.git@63424f4e795ecc577b90646381b374af3a627978" %environment export PATH=/usr/local/bin:$PATH From 541d3871c9fd213ba9423976e341c465d82d93b3 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 15:32:16 +0200 Subject: [PATCH 26/39] nltk setup --- apptainer/jureca.def | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git 
a/apptainer/jureca.def b/apptainer/jureca.def index 68be6eb..dc53dcc 100644 --- a/apptainer/jureca.def +++ b/apptainer/jureca.def @@ -19,11 +19,20 @@ From: nvcr.io/nvidia/pytorch:25.06-py3 # Optional: keep tool envs under /opt to avoid $HOME export UV_TOOL_DIR=/opt/uv-tools uv tool install --python 3.12 "lighteval[multilingual] @ git+https://github.com/huggingface/lighteval.git@63424f4e795ecc577b90646381b374af3a627978" + uv pip install --system --break-system-packages nltk + mkdir -p /opt/nltk_data + python - <<'PY' + import nltk + nltk.download('punkt', download_dir='/opt/nltk_data') + nltk.download('punkt_tab', download_dir='/opt/nltk_data') + print('nltk data downloaded') + PY %environment export PATH=/usr/local/bin:$PATH export UV_TOOL_BIN_DIR=/usr/local/bin export UV_TOOL_DIR=/opt/uv-tools + export NLTK_DATA=/opt/nltk_data %runscript From 006ab8d84e830042bb898d2151aec2b2cdbeffe8 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 15:38:31 +0200 Subject: [PATCH 27/39] nltk setup --- apptainer/jureca.def | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/apptainer/jureca.def b/apptainer/jureca.def index dc53dcc..915906a 100644 --- a/apptainer/jureca.def +++ b/apptainer/jureca.def @@ -22,11 +22,11 @@ From: nvcr.io/nvidia/pytorch:25.06-py3 uv pip install --system --break-system-packages nltk mkdir -p /opt/nltk_data python - <<'PY' - import nltk - nltk.download('punkt', download_dir='/opt/nltk_data') - nltk.download('punkt_tab', download_dir='/opt/nltk_data') - print('nltk data downloaded') - PY +import nltk +nltk.download('punkt', download_dir='/opt/nltk_data') +nltk.download('punkt_tab', download_dir='/opt/nltk_data') +print('nltk data downloaded') +PY %environment export PATH=/usr/local/bin:$PATH From 15bea15c6b4b4dba88baa4ade8573eb17253e946 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 17:44:20 +0200 Subject: [PATCH 28/39] fix: downloading nltk data for lighteval during container setup --- apptainer/jureca.def | 1 - apptainer/leonardo.def | 8 ++++++++ apptainer/lumi.def | 8 ++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/apptainer/jureca.def b/apptainer/jureca.def index 915906a..28a0391 100644 --- a/apptainer/jureca.def +++ b/apptainer/jureca.def @@ -25,7 +25,6 @@ From: nvcr.io/nvidia/pytorch:25.06-py3 import nltk nltk.download('punkt', download_dir='/opt/nltk_data') nltk.download('punkt_tab', download_dir='/opt/nltk_data') -print('nltk data downloaded') PY %environment diff --git a/apptainer/leonardo.def b/apptainer/leonardo.def index 14a9576..c9b2d74 100644 --- a/apptainer/leonardo.def +++ b/apptainer/leonardo.def @@ -19,11 +19,19 @@ From: nvcr.io/nvidia/pytorch:25.06-py3 # Optional: keep tool envs under /opt to avoid $HOME export UV_TOOL_DIR=/opt/uv-tools uv tool install --python 3.12 "lighteval[multilingual] @ git+https://github.com/huggingface/lighteval.git@63424f4e795ecc577b90646381b374af3a627978" + uv pip install --system --break-system-packages nltk + mkdir -p /opt/nltk_data + python - <<'PY' +import nltk +nltk.download('punkt', download_dir='/opt/nltk_data') +nltk.download('punkt_tab', download_dir='/opt/nltk_data') +PY %environment export PATH=/usr/local/bin:$PATH export UV_TOOL_BIN_DIR=/usr/local/bin export UV_TOOL_DIR=/opt/uv-tools + export NLTK_DATA=/opt/nltk_data %runscript exec bash "$@" \ No newline at end of file diff --git a/apptainer/lumi.def b/apptainer/lumi.def index 2f7e8c4..815d3a1 100644 --- a/apptainer/lumi.def +++ b/apptainer/lumi.def @@ -19,11 +19,19 @@ From: 
rocm/pytorch:rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.7.1 # Optional: keep tool envs under /opt to avoid $HOME export UV_TOOL_DIR=/opt/uv-tools uv tool install --python 3.12 "lighteval[multilingual] @ git+https://github.com/huggingface/lighteval.git@63424f4e795ecc577b90646381b374af3a627978" + uv pip install --system --break-system-packages nltk + mkdir -p /opt/nltk_data + python - <<'PY' +import nltk +nltk.download('punkt', download_dir='/opt/nltk_data') +nltk.download('punkt_tab', download_dir='/opt/nltk_data') +PY %environment export PATH=/usr/local/bin:$PATH export UV_TOOL_BIN_DIR=/usr/local/bin export UV_TOOL_DIR=/opt/uv-tools + export NLTK_DATA=/opt/nltk_data %runscript exec bash "$@" \ No newline at end of file From 9c97d25810689e4cdb3578a5cb1ed0574ccd5b56 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 17:44:54 +0200 Subject: [PATCH 29/39] suppressing all tqdm progress bars --- oellm/resources/clusters.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/oellm/resources/clusters.yaml b/oellm/resources/clusters.yaml index a963dad..738c25a 100644 --- a/oellm/resources/clusters.yaml +++ b/oellm/resources/clusters.yaml @@ -5,8 +5,8 @@ shared: HF_HOME: "{EVAL_BASE_DIR}/hf_data" # where HuggingFace models and datasets are stored EVAL_OUTPUT_DIR: "{EVAL_BASE_DIR}/{USER}" # where evaluations are written GPUS_PER_NODE: 1 - HF_HUB_DISABLE_PROGRESS_BARS: "0" - HF_DATASETS_DISABLE_PROGRESS_BARS: "0" + HF_HUB_DISABLE_PROGRESS_BARS: "1" + HF_DATASETS_DISABLE_PROGRESS_BARS: "1" leonardo: hostname_pattern: "*.leonardo.local" # use this regexp to automatically assign environment variables corresponding to this YAML From f11d4a4c8748407495ad490ed419170c8fe988d8 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 17:46:00 +0200 Subject: [PATCH 30/39] lighteval fixes --- oellm/resources/template.sbatch | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/oellm/resources/template.sbatch b/oellm/resources/template.sbatch index d02a93c..16cf705 100644 --- a/oellm/resources/template.sbatch +++ b/oellm/resources/template.sbatch @@ -122,7 +122,7 @@ do LIGHT_TASK_ARG="$LIGHT_TASK" fi else - LIGHT_TASK_ARG="${{LIGHT_TASK}}|$n_shot" + LIGHT_TASK_ARG="lighteval|${{LIGHT_TASK}}|$n_shot|0" fi fi @@ -135,8 +135,8 @@ do lighteval accelerate \ "model_name=$model_path,trust_remote_code=True" \ "$LIGHT_TASK_ARG" \ - --output-dir "$RESULTS_SUBDIR" \ - --save-details + --custom-tasks lighteval.tasks.multilingual.tasks \ + --output-dir "$RESULTS_SUBDIR" ;; *) echo "[warning] Unknown evaluation suite '$eval_suite'. Skipping." 
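The template change in PATCH 30 above pins the LightEval argument to the four-field form `suite|task|num_fewshot|truncate_few_shots`, matching how the pre-download helper in `oellm/utils.py` normalises bare task names; the doubled braces in `${{LIGHT_TASK}}` are shell braces escaped for the Python format-string rendering of the sbatch template. A hedged sketch of that normalisation (a standalone helper named here for illustration only, not a function in the codebase):

```python
# Hedged sketch: normalise a task entry into the 4-field LightEval spec
# "suite|task|num_fewshot|truncate_few_shots", mirroring the logic used by
# _pre_download_lighteval_datasets and the sbatch template above.
def to_lighteval_spec(task: str, n_shot: int = 0, truncate: int = 0) -> str:
    spec = task
    if "|" not in spec:
        spec = f"lighteval|{spec}|{n_shot}|{truncate}"
    elif spec.count("|") == 1:
        spec = f"{spec}|{n_shot}|{truncate}"
    elif spec.count("|") == 2:
        spec = f"{spec}|{truncate}"
    return spec

assert (
    to_lighteval_spec("flores200:pol_Latn-eng_Latn", 5)
    == "lighteval|flores200:pol_Latn-eng_Latn|5|0"
)
```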
From 096cbc07c2726eac1c9168ba385bf8b48b577f01 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 17:49:46 +0200 Subject: [PATCH 31/39] misc --- oellm/resources/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 oellm/resources/__init__.py diff --git a/oellm/resources/__init__.py b/oellm/resources/__init__.py deleted file mode 100644 index e69de29..0000000 From 6e888d7bf4b98a50cf27582b0aa14d4b03121f98 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 17:49:59 +0200 Subject: [PATCH 32/39] feat: aya-expanse tasks --- oellm/resources/task-groups.yaml | 52 ++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/oellm/resources/task-groups.yaml b/oellm/resources/task-groups.yaml index ee081f2..957581e 100644 --- a/oellm/resources/task-groups.yaml +++ b/oellm/resources/task-groups.yaml @@ -134,6 +134,55 @@ task_groups: - task: global_mmlu_full_tr - task: global_mmlu_full_uk - task: global_mmlu_full_he + mgsm-eu: + description: "EU Language GSM benchmarks in Aya Expanse" + suite: lm-eval-harness + n_shots: [5] + tasks: + - task: mgsm_native_cot_en + - task: mgsm_native_cot_de + - task: mgsm_native_cot_es + - task: mgsm_native_cot_fr + + generic-multilingual: + description: "Generic multilingual benchmarks in Aya Expanse" + suite: lm-eval-harness + n_shots: [0] + tasks: + - task: xwinograd + - task: xcopa + - task: xstorycloze + + include: + description: "INCLUDE benchmarks in Aya Expanse" + suite: lm-eval-harness + n_shots: [0] + tasks: + - task: include_base_44_albanian + - task: include_base_44_armenian + - task: include_base_44_azerbaijani + - task: include_base_44_basque + - task: include_base_44_belarusian + - task: include_base_44_bulgarian + - task: include_base_44_croatian + - task: include_base_44_dutch + - task: include_base_44_estonian + - task: include_base_44_finnish + - task: include_base_44_french + - task: include_base_44_georgian + - task: include_base_44_german + - task: include_base_44_greek + - task: include_base_44_hungarian + - task: include_base_44_italian + - task: include_base_44_lithuanian + - task: include_base_44_north macedonian + - task: include_base_44_polish + - task: include_base_44_portuguese + - task: include_base_44_russian + - task: include_base_44_serbian + - task: include_base_44_spanish + - task: include_base_44_turkish + - task: include_base_44_ukrainian super_groups: oellm-multilingual: @@ -143,3 +192,6 @@ super_groups: - task: flores-200-eng-to-eu - task: belebele-eu-5-shot - task: global-mmlu-eu + - task: mgsm-eu + - task: generic-multilingual + - task: include From 9d87217e3ef2496674cf58093661d1b6139e8d90 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 20:51:07 +0200 Subject: [PATCH 33/39] chore: schedule-eval logic cleanup --- oellm/main.py | 263 ++++++++++++++++--------------------------- oellm/task_cache.py | 10 +- oellm/task_groups.py | 20 +++- 3 files changed, 117 insertions(+), 176 deletions(-) diff --git a/oellm/main.py b/oellm/main.py index e04d87d..92bd230 100644 --- a/oellm/main.py +++ b/oellm/main.py @@ -1,8 +1,8 @@ import logging import os import re -import shutil import subprocess +from dataclasses import dataclass from datetime import datetime from importlib.resources import files from pathlib import Path @@ -17,6 +17,7 @@ from oellm.utils import ( _ensure_singularity_image, _expand_local_model_paths, + _filter_warnings, _load_cluster_env, _num_jobs_in_queue, _pre_download_lighteval_datasets, @@ -27,6 +28,14 @@ ) +@dataclass 
+class EvaluationJob: + model_path: Path | str + task_path: str + n_shot: int + eval_suite: str + + @capture_third_party_output_from_kwarg("verbose") def schedule_evals( models: str | None = None, @@ -75,16 +84,20 @@ def schedule_evals( _load_cluster_env() if not skip_checks: - image_name = os.environ.get("EVAL_CONTAINER_IMAGE") - if image_name is None: - raise ValueError( - "EVAL_CONTAINER_IMAGE is not set. Please set it in clusters.yaml." - ) - - _ensure_singularity_image(image_name) + _ensure_singularity_image(os.environ.get("EVAL_CONTAINER_IMAGE")) # type: ignore else: logging.info("Skipping container image check (--skip-checks enabled)") + if isinstance(models, str) and models is not None: + models = [m.strip() for m in models.split(",") if m.strip()] # type: ignore + + if isinstance(tasks, str) and tasks is not None: + tasks = [t.strip() for t in tasks.split(",") if t.strip()] # type: ignore + + if isinstance(n_shot, int) and n_shot is not None: + n_shot = [n_shot] + + eval_jobs: list[EvaluationJob] = [] if eval_csv_path: if models or tasks or task_groups or n_shot: raise ValueError( @@ -104,133 +117,97 @@ def schedule_evals( # Always expand local model paths, even with skip_checks df["model_path"].unique() - expanded_rows = [] - for _, row in df.iterrows(): - original_model_path = row["model_path"] - local_paths = _expand_local_model_paths(original_model_path) - if local_paths: - # Use expanded local paths - for expanded_path in local_paths: - new_row = row.copy() - new_row["model_path"] = expanded_path - expanded_rows.append(new_row) - else: - # Keep original path (might be HF model) - expanded_rows.append(row) - df = pd.DataFrame(expanded_rows) - - if "eval_suite" not in df.columns: - df["eval_suite"] = "lm_eval" - - # Download HF models only if skip_checks is False - if not skip_checks: - # Process any HF models that need downloading - hf_models = [m for m in df["model_path"].unique() if not Path(m).exists()] - if hf_models: - model_path_map = _process_model_paths(hf_models) - # Update the dataframe with processed HF models - for idx, row in df.iterrows(): - if row["model_path"] in model_path_map: - # This shouldn't expand further, just update the path - df.at[idx, "model_path"] = model_path_map[row["model_path"]][0] - else: - logging.info( - "Skipping model path processing and validation (--skip-checks enabled)" - ) - - elif models and ((tasks and n_shot is not None) or task_groups): - model_list = [m.strip() for m in models.split(",") if m.strip()] - model_paths: list[Path | str] = [] + eval_jobs.extend( + [ + EvaluationJob( + model_path=row["model_path"], + task_path=row["task_path"], + n_shot=row["n_shot"], + eval_suite=row["eval_suite"], + ) + for _, row in df.iterrows() + ] + ) - # Always expand local paths - for model in model_list: - local_paths = _expand_local_model_paths(model) - if local_paths: - model_paths.extend(local_paths) - else: - model_paths.append(model) - - # Download HF models only if skip_checks is False - if not skip_checks: - hf_models = [m for m in model_paths if not Path(m).exists()] - if hf_models: - model_path_map = _process_model_paths(hf_models) - # Replace HF model identifiers with processed paths - model_paths = [ - model_path_map[m][0] if m in model_path_map else m - for m in model_paths + elif models: + if task_groups is None: + eval_jobs.extend( + [ + EvaluationJob( + model_path=model, + task_path=task, + n_shot=shot, + eval_suite="lm_eval", + ) + for model in models + for task in tasks + for shot in n_shot ] + ) else: - logging.info( - 
"Skipping model path processing and validation (--skip-checks enabled)" + expanded = _expand_task_groups([g.strip() for g in task_groups.split(",")]) + eval_jobs.extend( + [ + EvaluationJob( + model_path=model, + task_path=result.task, + n_shot=result.n_shot, + eval_suite=result.suite, + ) + for model in models + for result in expanded + ] ) - rows: list[dict[str, Path | str | int]] = [] - - # Handle explicit tasks (lm_eval) with provided n_shot - if tasks: - if n_shot is None: - raise ValueError( - "When specifying `tasks`, you must also provide `n_shot`. For task groups, use `task_groups`." + expanded_eval_jobs = [] + for job in eval_jobs: + local_model_paths = _expand_local_model_paths(job.model_path) + if not local_model_paths: + expanded_eval_jobs.append(job) + else: + for path in local_model_paths: + expanded_eval_jobs.append( + EvaluationJob( + model_path=path, + task_path=job.task_path, + n_shot=job.n_shot, + eval_suite=job.eval_suite, + ) ) - tasks_list = [t.strip() for t in tasks.split(",") if t.strip()] - shots: list[int] - shots = n_shot if isinstance(n_shot, list) else [int(n_shot)] - for model_path in model_paths: - for task_name in tasks_list: - for s in shots: - rows.append( - { - "model_path": model_path, - "task_path": task_name, - "n_shot": int(s), - "eval_suite": "lm_eval", - } - ) - # Handle task groups - if task_groups: - group_names = [g.strip() for g in task_groups.split(",") if g.strip()] - # import pdb; pdb.set_trace() - expanded = _expand_task_groups(group_names) - for model_path in model_paths: - for task_name, n_shots, suite_name in expanded: - for s in n_shots: - rows.append( - { - "model_path": model_path, - "task_path": task_name, - "n_shot": int(s), - "eval_suite": suite_name, - } - ) - - df = pd.DataFrame( - rows, columns=["model_path", "task_path", "n_shot", "eval_suite"] - ) + if not skip_checks: + hub_models: set[str | Path] = { + job.model_path + for job in expanded_eval_jobs + if not Path(job.model_path).exists() + } + _process_model_paths(hub_models) else: - raise ValueError( - "Provide `eval_csv_path`, or `models` with (`tasks` and `n_shot`) and/or `task_groups`." + logging.info( + "Skipping model path processing and validation (--skip-checks enabled)" ) + # create csv + df = pd.DataFrame(expanded_eval_jobs) + if df.empty: logging.warning("No evaluation jobs to schedule.") return None + df["eval_suite"] = df["eval_suite"].str.lower() + # Ensure that all datasets required by the tasks are cached locally to avoid # network access on compute nodes. 
if not skip_checks: - lm_eval_tasks = df[ - df["eval_suite"].str.lower().isin({"lm_eval", "lm-eval", "lm-eval-harness"}) - ]["task_path"].unique() + lm_eval_tasks = df[df["eval_suite"].isin({"lm-eval-harness"})][ + "task_path" + ].unique() if len(lm_eval_tasks) > 0: _pre_download_task_datasets( lm_eval_tasks, trust_remote_code=trust_remote_code ) # Pre-download LightEval datasets (best-effort, incremental support) - light_eval_tasks = df[ - df["eval_suite"].str.lower().isin({"lighteval", "light-eval"}) - ]["task_path"].unique() + light_eval_tasks = df[df["eval_suite"].isin({"light-eval"})]["task_path"].unique() if len(light_eval_tasks) > 0: _pre_download_lighteval_datasets(light_eval_tasks) else: @@ -239,8 +216,9 @@ def schedule_evals( if download_only: return None - queue_limit = int(os.environ.get("QUEUE_LIMIT", 250)) - remaining_queue_capacity = queue_limit - _num_jobs_in_queue() + remaining_queue_capacity = ( + int(os.environ.get("QUEUE_LIMIT", 250)) - _num_jobs_in_queue() + ) if remaining_queue_capacity <= 0: logging.warning("No remaining queue capacity. Not scheduling any jobs.") @@ -269,61 +247,24 @@ def schedule_evals( df.to_csv(csv_path, index=False) - logging.debug(f"Saved evaluation dataframe to temporary CSV: {csv_path}") - sbatch_template = (files("oellm.resources") / "template.sbatch").read_text() # Calculate dynamic array size and time limits total_evals = len(df) - - # fixed timing estimation minutes_per_eval = 10 # Budget 10 minutes per eval total_minutes = total_evals * minutes_per_eval - - # Copy LightEval benchmark files into evaluation directory if necessary - # TODO: why do we need this? - light_eval_paths = df[df["eval_suite"].str.lower().isin({"lighteval", "light-eval"})][ - "task_path" - ].unique() - benchmark_dir = evals_dir / "light_eval_tasks" - copied_paths: dict[str, str] = {} - if light_eval_paths.size > 0: - benchmark_dir.mkdir(parents=True, exist_ok=True) - for task_path in light_eval_paths: - candidate = Path(task_path) - if candidate.exists() and candidate.is_file(): - destination = benchmark_dir / candidate.name - shutil.copy(candidate, destination) - copied_paths[str(candidate)] = str(destination) - - if copied_paths: - df.replace({"task_path": copied_paths}, inplace=True) - - # Maximum runtime per job (18 hours with safety margin) max_minutes_per_job = 18 * 60 # 18 hours min_array_size_for_time = max(1, int(np.ceil(total_minutes / max_minutes_per_job))) desired_array_size = min(128, total_evals) if total_evals >= 128 else total_evals if desired_array_size < min_array_size_for_time: desired_array_size = min_array_size_for_time - - # The actual array size is limited by queue capacity and total evals actual_array_size = min(remaining_queue_capacity, desired_array_size, total_evals) - - # Calculate actual time per job evals_per_job = max(1, int(np.ceil(total_evals / actual_array_size))) minutes_per_job = evals_per_job * minutes_per_eval - - # Add 20% safety margin and round up to nearest hour minutes_with_margin = int(minutes_per_job * 1.2) hours_with_margin = max(1, int(np.ceil(minutes_with_margin / 60))) - - # Apply 3-hour safety minimum for array jobs hours_with_margin = max(hours_with_margin, 3) - - # Cap at 24 hours hours_with_margin = min(hours_with_margin, 23) - - # Format time limit for SLURM (HH:MM:SS) time_limit = f"{hours_with_margin:02d}:59:00" # Log the calculated values @@ -343,8 +284,6 @@ def schedule_evals( ) logging.info(f" Time limit with safety margin: {time_limit}") - # replace the placeholders in the template with the actual values - # 
First, replace python-style placeholders sbatch_script = sbatch_template.format( csv_path=csv_path, max_array_len=max_array_len, @@ -356,13 +295,10 @@ def schedule_evals( time_limit=time_limit, # Dynamic time limit ) - # substitute any $ENV_VAR occurrences (e.g., $TIME_LIMIT) since env vars are not - # expanded in the #SBATCH directives + # substitute any $ENV_VAR occurrences sbatch_script = Template(sbatch_script).safe_substitute(os.environ) - # Save the sbatch script to the evals directory sbatch_script_path = evals_dir / "submit_evals.sbatch" - logging.debug(f"Saving sbatch script to {sbatch_script_path}") with open(sbatch_script_path, "w") as f: f.write(sbatch_script) @@ -555,7 +491,7 @@ def collect_results( # Print summary statistics if verbose: - logging.info("\nSummary:") + logging.info("Summary:") logging.info(f"Unique models: {df['model_name'].nunique()}") logging.info(f"Unique tasks: {df['task'].nunique()}") logging.info( @@ -564,7 +500,7 @@ def collect_results( # Perform check analysis if requested if check: - logging.info("\n=== Evaluation Status Check ===") + logging.info("=== Evaluation Status Check ===") # Find missing jobs missing_jobs = [] @@ -599,7 +535,7 @@ def collect_results( completed_count = len(jobs_df) - len(missing_jobs) - logging.info(f"\nTotal scheduled jobs: {len(jobs_df)}") + logging.info(f"Total scheduled jobs: {len(jobs_df)}") logging.info(f"Completed jobs: {completed_count}") logging.info(f"Missing jobs: {len(missing_jobs)}") @@ -607,14 +543,14 @@ def collect_results( missing_df = pd.DataFrame(missing_jobs) missing_csv = output_csv.replace(".csv", "_missing.csv") missing_df.to_csv(missing_csv, index=False) - logging.info(f"\nMissing jobs saved to: {missing_csv}") + logging.info(f"Missing jobs saved to: {missing_csv}") logging.info( f"You can run these with: oellm schedule-eval --eval_csv_path {missing_csv}" ) # Show some examples if verbose if verbose and len(missing_jobs) > 0: - logging.info("\nExample missing jobs:") + logging.info("Example missing jobs:") for _i, (_, job) in enumerate(missing_df.head(5).iterrows()): logging.info( f" - {job['model_path']} | {job['task_path']} | n_shot={job['n_shot']}" @@ -624,6 +560,7 @@ def collect_results( def main(): + _filter_warnings() auto_cli( { "schedule-eval": schedule_evals, diff --git a/oellm/task_cache.py b/oellm/task_cache.py index 2fdeea0..7b58e52 100644 --- a/oellm/task_cache.py +++ b/oellm/task_cache.py @@ -284,21 +284,13 @@ def prewarm_from_payload(payload: dict | None, *, trust_remote_code: bool = True if not isinstance(call, dict): continue # Unified prewarm log message - dataset_id = None if call.get("type") == "load_dataset": path = call.get("path") name = call.get("name") - dataset_id = f"{path}{'::' + name if name else ''}" else: repo_id = call.get("repo_id") filename = call.get("filename") - dataset_id = ( - f"{repo_id}{'/' + filename if filename else ''}" - if isinstance(repo_id, str) - else None - ) - if dataset_id: - logging.info(f"Prewarming dataset cache: {dataset_id}") + if call.get("type") == "snapshot_download": repo_id = call.get("repo_id") if isinstance(repo_id, str) and repo_id: diff --git a/oellm/task_groups.py b/oellm/task_groups.py index 1419de7..df3f496 100644 --- a/oellm/task_groups.py +++ b/oellm/task_groups.py @@ -1,3 +1,4 @@ +from typing import TypedDict from collections.abc import Iterable from dataclasses import dataclass from importlib.resources import files @@ -107,25 +108,36 @@ def _parse_task_groups( } -def _expand_task_groups(group_names: Iterable[str]) -> list[tuple[str, 
list[int], str]]: +@dataclass +class TaskGroupResult: + task: str + n_shot: int + suite: str + + +def _expand_task_groups(group_names: Iterable[str]) -> list[TaskGroupResult]: parsed = _parse_task_groups([str(n).strip() for n in group_names if str(n).strip()]) missing = {str(n).strip() for n in group_names if str(n).strip()} - set(parsed.keys()) if missing: raise ValueError(f"Unknown task group(s): {', '.join(sorted(missing))}") - results: list[tuple[str, list[int], str]] = [] + results: list[TaskGroupResult] = [] for _, group in parsed.items(): if isinstance(group, TaskGroup): suite = group.suite for t in group.tasks: shots = [int(s) for s in (t.n_shots or [])] - results.append((t.name, shots, suite)) + for shot in shots: + results.append(TaskGroupResult(task=t.name, n_shot=shot, suite=suite)) else: for g in group.task_groups: suite = g.suite for t in g.tasks: shots = [int(s) for s in (t.n_shots or [])] - results.append((t.name, shots, suite)) + for shot in shots: + results.append( + TaskGroupResult(task=t.name, n_shot=shot, suite=suite) + ) return results From 4f9f8a8e88522c63ae9a45ed0010e4f53eff7bc1 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 20:51:30 +0200 Subject: [PATCH 34/39] feat: adding spinners --- oellm/utils.py | 419 +++++++++++++++++++++++-------------------------- 1 file changed, 200 insertions(+), 219 deletions(-) diff --git a/oellm/utils.py b/oellm/utils.py index 90d0660..ef2ed5f 100644 --- a/oellm/utils.py +++ b/oellm/utils.py @@ -25,90 +25,32 @@ task_cache_set_payload, ) +_RICH_CONSOLE: Console | None = None -@contextmanager -def suppress_tqdm_rendering(enabled: bool = True): - """ - Temporarily suppresses tqdm progress bar rendering when enabled=True. - - This prevents any visual rendering by overriding the class methods - responsible for output, without altering other behavior. - """ - if not enabled: - yield - return - - import tqdm as _tqdm - from tqdm import auto as _tqdm_auto - - classes = [_tqdm.tqdm, _tqdm_auto.tqdm] - seen: set[int] = set() - patched: list[tuple[object, str, object]] = [] - - for cls in classes: - cid = id(cls) - if cid in seen: - continue - seen.add(cid) - - if hasattr(cls, "display"): - orig_display = cls.display # type: ignore[attr-defined] - - def _noop_display(self, *args, **kwargs): - return None - - cls.display = _noop_display # type: ignore[assignment] - patched.append((cls, "display", orig_display)) - - if hasattr(cls, "refresh"): - orig_refresh = cls.refresh # type: ignore[attr-defined] - - def _noop_refresh(self, *args, **kwargs): - return None - - cls.refresh = _noop_refresh # type: ignore[assignment] - patched.append((cls, "refresh", orig_refresh)) - - try: - yield - finally: - for cls, name, orig in patched: - setattr(cls, name, orig) - - -def filter_tqdm(enabled: bool = True): - """ - Decorator factory to suppress tqdm rendering for the wrapped function - when enabled=True. 
- """ - def _decorator(func): - @wraps(func) - def _wrapper(*args, **kwargs): - with suppress_tqdm_rendering(enabled=enabled): - return func(*args, **kwargs) +def get_console() -> Console: + global _RICH_CONSOLE + if _RICH_CONSOLE is None: + _RICH_CONSOLE = Console() + return _RICH_CONSOLE - return _wrapper - - return _decorator - -@filter_tqdm(enabled=False) def _ensure_singularity_image(image_name: str) -> None: from huggingface_hub import hf_hub_download image_path = Path(os.getenv("EVAL_BASE_DIR")) / image_name - logging.info(f"Downloading latest Singularity image from HuggingFace: {image_name}") - try: - hf_hub_download( - repo_id="openeurollm/evaluation_singularity_images", - filename=image_name, - repo_type="dataset", - local_dir=os.getenv("EVAL_BASE_DIR"), - ) - logging.info("Successfully downloaded latest Singularity image from HuggingFace") + console = get_console() + with console.status( + "Downloading latest Singularity image from HuggingFace", spinner="dots" + ): + hf_hub_download( + repo_id="openeurollm/evaluation_singularity_images", + filename=image_name, + repo_type="dataset", + local_dir=os.getenv("EVAL_BASE_DIR"), + ) except Exception as e: logging.warning( "Failed to fetch latest container image from HuggingFace: %s", str(e) @@ -121,15 +63,10 @@ def _ensure_singularity_image(image_name: str) -> None: f"Cannot proceed with evaluation scheduling." ) from e - logging.info( - "Singularity image ready at %s", - Path(os.getenv("EVAL_BASE_DIR")) / os.getenv("EVAL_CONTAINER_IMAGE"), - ) - def _setup_logging(verbose: bool = False): rich_handler = RichHandler( - console=Console(), + console=get_console(), show_time=True, log_time_format="%H:%M:%S", show_path=False, @@ -208,7 +145,7 @@ def _num_jobs_in_queue() -> int: return sum(1 for line in output.splitlines() if line.strip()) -def _expand_local_model_paths(model: str) -> list[Path]: +def _expand_local_model_paths(model: str | Path) -> list[Path]: """ Expands a local model path to include all checkpoints if it's a directory. Recursively searches for models in subdirectories. @@ -257,8 +194,7 @@ def _expand_local_model_paths(model: str) -> list[Path]: return model_paths -@filter_tqdm(enabled=True) -def _process_model_paths(models: Iterable[str]) -> dict[str, list[Path | str]]: +def _process_model_paths(models: Iterable[str]): """ Processes model strings into a dict of model paths. 
@@ -267,86 +203,102 @@ def _process_model_paths(models: Iterable[str]) -> dict[str, list[Path | str]]: """ from huggingface_hub import snapshot_download - processed_model_paths: dict[str, list[Path | str]] = {} + console = get_console() + models_list = list(models) - for model in models: - per_model_paths: list[Path | str] = [] + with console.status( + f"Processing models… 0/{len(models_list)}", spinner="dots" + ) as status: + for idx, model in enumerate(models_list, 1): + status.update(f"Checking model '{model}' ({idx}/{len(models_list)})") + per_model_paths: list[Path | str] = [] - local_paths = _expand_local_model_paths(model) - if local_paths: - per_model_paths.extend(local_paths) - else: - logging.info( - f"Model {model} not found locally, assuming it is a πŸ€— hub model" - ) - logging.debug( - f"Downloading model {model} on the login node since the compute nodes may not have access to the internet" - ) - - if "," in model: - model_kwargs = dict( - [kv.split("=") for kv in model.split(",") if "=" in kv] + local_paths = _expand_local_model_paths(model) + if local_paths: + per_model_paths.extend(local_paths) + status.update(f"Using local model '{model}' ({idx}/{len(models_list)})") + else: + logging.info( + f"Model {model} not found locally, assuming it is a πŸ€— hub model" ) + logging.debug( + f"Downloading model {model} on the login node since the compute nodes may not have access to the internet" + ) + + if "," in model: + model_kwargs = dict( + [kv.split("=") for kv in model.split(",") if "=" in kv] + ) - repo_id = model.split(",")[0] + repo_id = model.split(",")[0] - snapshot_kwargs = {} - if "revision" in model_kwargs: - snapshot_kwargs["revision"] = model_kwargs["revision"] + snapshot_kwargs = {} + if "revision" in model_kwargs: + snapshot_kwargs["revision"] = model_kwargs["revision"] - try: + status.update(f"Downloading '{repo_id}' ({idx}/{len(models_list)})") + try: + snapshot_download( + repo_id=repo_id, + cache_dir=Path(os.getenv("HF_HOME")) / "hub", + **snapshot_kwargs, + ) + per_model_paths.append(model) + except Exception as e: + logging.debug( + f"Failed to download model {model} from Hugging Face Hub. Continuing..." + ) + logging.debug(e) + else: + status.update(f"Downloading '{model}' ({idx}/{len(models_list)})") snapshot_download( - repo_id=repo_id, + repo_id=model, cache_dir=Path(os.getenv("HF_HOME")) / "hub", - **snapshot_kwargs, ) per_model_paths.append(model) - except Exception as e: - logging.debug( - f"Failed to download model {model} from Hugging Face Hub. Continuing..." - ) - logging.debug(e) - else: - snapshot_download( - repo_id=model, - cache_dir=Path(os.getenv("HF_HOME")) / "hub", - ) - per_model_paths.append(model) - - if not per_model_paths: - logging.warning( - f"Could not find any valid model for '{model}'. It will be skipped." - ) - processed_model_paths[model] = per_model_paths - return processed_model_paths + if not per_model_paths: + logging.warning( + f"Could not find any valid model for '{model}'. It will be skipped." + ) -@filter_tqdm(enabled=True) def _pre_download_task_datasets( tasks: Iterable[str], trust_remote_code: bool = True ) -> None: processed: set[str] = set() misses: list[str] = [] - for task_name in tasks: - if not isinstance(task_name, str) or task_name in processed: - continue - processed.add(task_name) - if task_cache_lookup("lm-eval", task_name): - logging.info( - f"Skipping dataset preparation for task '{task_name}' (cache hit within TTL)." 
+ console = get_console() + with console.status("Checking lm-eval datasets…", spinner="dots") as status: + cache_hits = 0 + for task_name in tasks: + if not isinstance(task_name, str) or task_name in processed: + continue + processed.add(task_name) + if task_cache_lookup("lm-eval", task_name): + cache_hits += 1 + status.update( + f"Checking lm-eval datasets… {cache_hits} cached, {len(misses)} to prepare" + ) + continue + misses.append(task_name) + status.update( + f"Checking lm-eval datasets… {cache_hits} cached, {len(misses)} to prepare" ) - continue - misses.append(task_name) if not misses: - for task_name in processed: - if task_cache_lookup("lm-eval", task_name): - prewarm_from_payload( - task_cache_get_payload("lm-eval", task_name), - trust_remote_code=trust_remote_code, - ) + with console.status( + f"Using cached lm-eval datasets for {len(processed)} tasks…", + spinner="dots", + ) as status: + for task_name in processed: + if task_cache_lookup("lm-eval", task_name): + status.update(f"Loading cached dataset for '{task_name}'…") + prewarm_from_payload( + task_cache_get_payload("lm-eval", task_name), + trust_remote_code=trust_remote_code, + ) return from datasets import DownloadMode # type: ignore @@ -354,96 +306,115 @@ def _pre_download_task_datasets( tm = TaskManager() - for task_name in misses: - logging.info( - f"Preparing dataset for task '{task_name}' (download if not cached)…" - ) - - task_config = { - "task": task_name, - "dataset_kwargs": {"trust_remote_code": trust_remote_code}, - } - - with capture_hf_dataset_calls() as captured_calls: - task_objects = tm.load_config(task_config) - - stack = [task_objects] - while stack: - current = stack.pop() - if isinstance(current, dict): - stack.extend(current.values()) - continue - if hasattr(current, "download") and callable(current.download): - try: - current.download( - download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS - ) # type: ignore[arg-type] - except TypeError as e: - logging.error( - f"Failed to download dataset for task '{task_name}' with download_mode=REUSE_DATASET_IF_EXISTS: {e}" - ) - current.download() # type: ignore[misc] - - if captured_calls: - payload = {"calls": dedupe_calls(captured_calls)} - task_cache_set_payload("lm-eval", task_name, payload) - task_cache_mark_resolved("lm-eval", task_name) - logging.debug(f"Finished dataset preparation for task '{task_name}'.") + with console.status( + f"Preparing lm-eval datasets… {len(misses)} remaining", + spinner="dots", + ) as status: + for idx, task_name in enumerate(misses, 1): + status.update(f"Preparing dataset for '{task_name}' ({idx}/{len(misses)})") + + task_config = { + "task": task_name, + "dataset_kwargs": {"trust_remote_code": trust_remote_code}, + } + + with capture_hf_dataset_calls() as captured_calls: + task_objects = tm.load_config(task_config) + + stack = [task_objects] + while stack: + current = stack.pop() + if isinstance(current, dict): + stack.extend(current.values()) + continue + if hasattr(current, "download") and callable(current.download): + try: + current.download( + download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS + ) # type: ignore[arg-type] + except TypeError as e: + logging.error( + f"Failed to download dataset for task '{task_name}' with download_mode=REUSE_DATASET_IF_EXISTS: {e}" + ) + current.download() # type: ignore[misc] + + if captured_calls: + payload = {"calls": dedupe_calls(captured_calls)} + task_cache_set_payload("lm-eval", task_name, payload) + task_cache_mark_resolved("lm-eval", task_name) + logging.debug(f"Finished dataset 
preparation for task '{task_name}'.") -@filter_tqdm(enabled=True) def _pre_download_lighteval_datasets(tasks: Iterable[str]) -> None: seen: set[str] = set() misses: list[str] = [] tasks = [str(task).strip() for task in tasks] - for task in tasks: - if not task or task in seen: - continue - seen.add(task) - if task_cache_lookup("lighteval", task): - logging.info( - f"Skipping dataset preparation for task '{task}' (cache hit within TTL)." - ) - continue - misses.append(task) - - if not misses: - for task in seen: + console = get_console() + with console.status("Checking lighteval datasets…", spinner="dots") as status: + cache_hits = 0 + for task in tasks: + if not task or task in seen: + continue + seen.add(task) if task_cache_lookup("lighteval", task): - prewarm_from_payload( - task_cache_get_payload("lighteval", task), - trust_remote_code=True, + cache_hits += 1 + status.update( + f"Checking lighteval datasets… {cache_hits} cached, {len(misses)} to prepare" ) - return - - for task in misses: - with capture_hf_dataset_calls() as captured_calls: - from lighteval.tasks.lighteval_task import LightevalTask - from lighteval.tasks.registry import ( - TRUNCATE_FEW_SHOTS_DEFAULTS, - Registry, + continue + misses.append(task) + status.update( + f"Checking lighteval datasets… {cache_hits} cached, {len(misses)} to prepare" ) - reg = Registry(custom_tasks="lighteval.tasks.multilingual.tasks") - truncate_default = int(TRUNCATE_FEW_SHOTS_DEFAULTS) - - spec = task - if "|" not in spec: - spec = f"lighteval|{spec}|0|{truncate_default}" - elif spec.count("|") == 1: - spec = f"{spec}|0|{truncate_default}" - elif spec.count("|") == 2: - spec = f"{spec}|{truncate_default}" + if not misses: + with console.status( + f"Using cached lighteval datasets for {len(seen)} tasks…", + spinner="dots", + ): + for task in seen: + if task_cache_lookup("lighteval", task): + prewarm_from_payload( + task_cache_get_payload("lighteval", task), + trust_remote_code=True, + ) + return - configs = reg.get_tasks_configs(spec) - task_dict = reg.get_tasks_from_configs(configs) - LightevalTask.load_datasets(task_dict) + with console.status( + f"Preparing lighteval datasets… {len(misses)} remaining", + spinner="dots", + ) as status: + for idx, task in enumerate(misses, 1): + status.update(f"Preparing dataset for '{task}' ({idx}/{len(misses)})") + with capture_hf_dataset_calls() as captured_calls: + from lighteval.tasks.lighteval_task import LightevalTask + from lighteval.tasks.registry import ( + TRUNCATE_FEW_SHOTS_DEFAULTS, + Registry, + ) - payload = ( - {"calls": dedupe_calls(captured_calls)} if captured_calls else {"calls": []} - ) - task_cache_set_payload("lighteval", task, payload) - task_cache_mark_resolved("lighteval", task) + reg = Registry(custom_tasks="lighteval.tasks.multilingual.tasks") + truncate_default = int(TRUNCATE_FEW_SHOTS_DEFAULTS) + + spec = task + if "|" not in spec: + spec = f"lighteval|{spec}|0|{truncate_default}" + elif spec.count("|") == 1: + spec = f"{spec}|0|{truncate_default}" + elif spec.count("|") == 2: + spec = f"{spec}|{truncate_default}" + + configs = reg.get_tasks_configs(spec) + task_dict = reg.get_tasks_from_configs(configs) + LightevalTask.load_datasets(task_dict) + + payload = ( + {"calls": dedupe_calls(captured_calls)} + if captured_calls + else {"calls": []} + ) + task_cache_set_payload("lighteval", task, payload) + task_cache_mark_resolved("lighteval", task) @contextmanager @@ -546,3 +517,13 @@ def _wrapper(*args, **kwargs): return _wrapper return _decorator + + +def _filter_warnings(): + """ + 
Filters warnings from the lm_eval and lighteval libraries. + """ + import warnings + + warnings.filterwarnings("ignore", module="lm_eval") + warnings.filterwarnings("ignore", module="lighteval") From fe067fac795f40b5a50cec69c812667b8b19b59d Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 20:53:27 +0200 Subject: [PATCH 35/39] chore: making pre-commit happy --- .github/workflows/ci.yml | 1 - .gitignore | 2 +- .pre-commit-config.yaml | 3 +-- apptainer/build_sif_local.sh | 2 +- apptainer/jureca.def | 4 ++-- apptainer/leonardo.def | 4 ++-- apptainer/lumi.def | 4 ++-- oellm/resources/task-groups.yaml | 4 ++-- oellm/task_cache.py | 1 - oellm/task_groups.py | 3 +-- 10 files changed, 12 insertions(+), 16 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8a03dd5..6cf30a0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -51,4 +51,3 @@ jobs: run: | uvx yamllint . || true continue-on-error: true - diff --git a/.gitignore b/.gitignore index 9e29ad2..b897fcc 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,4 @@ **/*.egg-info **/*.csv **/uv.lock -**/task_map_cache.json \ No newline at end of file +**/task_map_cache.json diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 94ccd57..f47629e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.5.0 + rev: v6.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer @@ -18,4 +18,3 @@ repos: - id: ruff args: [--fix, --exit-non-zero-on-fix] - id: ruff-format - diff --git a/apptainer/build_sif_local.sh b/apptainer/build_sif_local.sh index 5b1fed5..6561919 100755 --- a/apptainer/build_sif_local.sh +++ b/apptainer/build_sif_local.sh @@ -28,4 +28,4 @@ for def in "${APPTAINER_DIR}"/*.def; do build_one "${def}" done -echo "\nAll SIF images built successfully. Find them under: ${OUTPUT_DIR}" \ No newline at end of file +echo "\nAll SIF images built successfully. 
Find them under: ${OUTPUT_DIR}" diff --git a/apptainer/jureca.def b/apptainer/jureca.def index 28a0391..7f088ad 100644 --- a/apptainer/jureca.def +++ b/apptainer/jureca.def @@ -26,7 +26,7 @@ import nltk nltk.download('punkt', download_dir='/opt/nltk_data') nltk.download('punkt_tab', download_dir='/opt/nltk_data') PY - + %environment export PATH=/usr/local/bin:$PATH export UV_TOOL_BIN_DIR=/usr/local/bin @@ -35,4 +35,4 @@ PY %runscript - exec bash "$@" \ No newline at end of file + exec bash "$@" diff --git a/apptainer/leonardo.def b/apptainer/leonardo.def index c9b2d74..f61f282 100644 --- a/apptainer/leonardo.def +++ b/apptainer/leonardo.def @@ -26,7 +26,7 @@ import nltk nltk.download('punkt', download_dir='/opt/nltk_data') nltk.download('punkt_tab', download_dir='/opt/nltk_data') PY - + %environment export PATH=/usr/local/bin:$PATH export UV_TOOL_BIN_DIR=/usr/local/bin @@ -34,4 +34,4 @@ PY export NLTK_DATA=/opt/nltk_data %runscript - exec bash "$@" \ No newline at end of file + exec bash "$@" diff --git a/apptainer/lumi.def b/apptainer/lumi.def index 815d3a1..c19f85f 100644 --- a/apptainer/lumi.def +++ b/apptainer/lumi.def @@ -26,7 +26,7 @@ import nltk nltk.download('punkt', download_dir='/opt/nltk_data') nltk.download('punkt_tab', download_dir='/opt/nltk_data') PY - + %environment export PATH=/usr/local/bin:$PATH export UV_TOOL_BIN_DIR=/usr/local/bin @@ -34,4 +34,4 @@ PY export NLTK_DATA=/opt/nltk_data %runscript - exec bash "$@" \ No newline at end of file + exec bash "$@" diff --git a/oellm/resources/task-groups.yaml b/oellm/resources/task-groups.yaml index 957581e..69ca6c8 100644 --- a/oellm/resources/task-groups.yaml +++ b/oellm/resources/task-groups.yaml @@ -143,7 +143,7 @@ task_groups: - task: mgsm_native_cot_de - task: mgsm_native_cot_es - task: mgsm_native_cot_fr - + generic-multilingual: description: "Generic multilingual benchmarks in Aya Expanse" suite: lm-eval-harness @@ -153,7 +153,7 @@ task_groups: - task: xcopa - task: xstorycloze - include: + include: description: "INCLUDE benchmarks in Aya Expanse" suite: lm-eval-harness n_shots: [0] diff --git a/oellm/task_cache.py b/oellm/task_cache.py index 7b58e52..a320bee 100644 --- a/oellm/task_cache.py +++ b/oellm/task_cache.py @@ -1,5 +1,4 @@ import json -import logging from contextlib import contextmanager from contextvars import ContextVar from datetime import datetime diff --git a/oellm/task_groups.py b/oellm/task_groups.py index df3f496..73c7d35 100644 --- a/oellm/task_groups.py +++ b/oellm/task_groups.py @@ -1,4 +1,3 @@ -from typing import TypedDict from collections.abc import Iterable from dataclasses import dataclass from importlib.resources import files @@ -128,7 +127,7 @@ def _expand_task_groups(group_names: Iterable[str]) -> list[TaskGroupResult]: suite = group.suite for t in group.tasks: shots = [int(s) for s in (t.n_shots or [])] - for shot in shots: + for shot in shots: results.append(TaskGroupResult(task=t.name, n_shot=shot, suite=suite)) else: for g in group.task_groups: From f552c96e4ff213d8b0f745587957ff6f3e62ce0d Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 21:01:44 +0200 Subject: [PATCH 36/39] misc --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6cf30a0..0d9fa48 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,7 +14,7 @@ jobs: - uses: actions/checkout@v4 - name: Install uv - uses: astral-sh/setup-uv@v3 + uses: astral-sh/setup-uv@v7 with: version: 
"latest" @@ -40,7 +40,7 @@ jobs: - uses: actions/checkout@v4 - name: Install uv - uses: astral-sh/setup-uv@v3 + uses: astral-sh/setup-uv@v7 with: version: "latest" From 9bbf5c10083da83c3ff22318aa3cfe93fc9b46bc Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 23:21:42 +0200 Subject: [PATCH 37/39] fix: restrict model parallel --- oellm/resources/template.sbatch | 1 + 1 file changed, 1 insertion(+) diff --git a/oellm/resources/template.sbatch b/oellm/resources/template.sbatch index 16cf705..de1aa69 100644 --- a/oellm/resources/template.sbatch +++ b/oellm/resources/template.sbatch @@ -131,6 +131,7 @@ do singularity exec $SINGULARITY_ARGS \ --bind $BIND_PATHS \ + --env CUDA_VISIBLE_DEVICES=$SLURM_GPUS_PER_NODE \ $EVAL_SIF_PATH \ lighteval accelerate \ "model_name=$model_path,trust_remote_code=True" \ From 1b81460cd217acfc6e5cc3aee9cc20498fc4b0bd Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Thu, 23 Oct 2025 09:39:11 +0200 Subject: [PATCH 38/39] fix: result collection --- oellm/main.py | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/oellm/main.py b/oellm/main.py index 92bd230..c1ffac2 100644 --- a/oellm/main.py +++ b/oellm/main.py @@ -434,20 +434,107 @@ def collect_results( results = data.get("results", {}) n_shot_data = data.get("n-shot", {}) + # Infer a global n_shot if exactly one unique value exists in this JSON + global_n_shot = None + try: + candidate_values = [] + for _v in n_shot_data.values(): + if isinstance(_v, (int | float)): + candidate_values.append(int(_v)) + elif isinstance(_v, str) and _v.isdigit(): + candidate_values.append(int(_v)) + unique_values = set(candidate_values) + if len(unique_values) == 1: + global_n_shot = next(iter(unique_values)) + except Exception: + pass + + # Aggregate groups (lm-eval harness) + groups_map = data.get("groups", {}) + group_subtasks_map = data.get("group_subtasks", {}) + group_aggregate_names = set(groups_map.keys()) | set(group_subtasks_map.keys()) + group_subtask_names: set[str] = set() + for _agg, _subs in group_subtasks_map.items(): + for _s in _subs: + group_subtask_names.add(_s) + + # Prefer only the first aggregate metric from groups (simplified) + if groups_map: + group_name, group_results = next(iter(groups_map.items())) + n_shot = n_shot_data.get(group_name, "unknown") + if n_shot == "unknown": + for subtask_name in group_subtasks_map.get(group_name, []): + if subtask_name in n_shot_data: + n_shot = n_shot_data[subtask_name] + break + if n_shot == "unknown" and global_n_shot is not None: + n_shot = global_n_shot + performance = group_results.get("acc,none") + if performance is None: + for metric in ["acc", "accuracy", "f1", "exact_match"]: + if metric in group_results: + performance = group_results[metric] + break + if performance is not None: + if check: + completed_jobs.add((model_name, group_name, n_shot)) + rows.append( + { + "model_name": model_name, + "task": group_name, + "n_shot": n_shot, + "performance": performance, + } + ) + # Skip per-task iteration when groups are present + continue + for task_name, task_results in results.items(): + # Skip entries already added from groups + if groups_map and task_name in group_aggregate_names: + continue + # Skip any lm-eval group subtasks; keep only aggregates + if task_name in group_subtask_names: + continue + # Skip MMLU subtasks - only keep the aggregate score if task_name.startswith("mmlu_") and task_name != "mmlu": continue + # Skip Global MMLU subtasks - keep only aggregates like global_mmlu_full_pt 
+ if task_name.startswith("global_mmlu_") and task_name.count("_") >= 4: + continue + # Get n_shot for this task n_shot = n_shot_data.get(task_name, "unknown") + # If this is a group aggregate and n_shot is missing, derive from any subtask + if task_name in group_aggregate_names and n_shot == "unknown": + for subtask_name in group_subtasks_map.get(task_name, []): + if subtask_name in n_shot_data: + n_shot = n_shot_data[subtask_name] + break + if n_shot == "unknown" and global_n_shot is not None: + n_shot = global_n_shot + # Special handling for MMLU aggregate - get n_shot from any MMLU subtask if task_name == "mmlu" and n_shot == "unknown": for key, value in n_shot_data.items(): if key.startswith("mmlu_"): n_shot = value break + if n_shot == "unknown" and global_n_shot is not None: + n_shot = global_n_shot + + # Special handling for Global MMLU aggregates - get n_shot from subtasks + if task_name.startswith("global_mmlu_") and n_shot == "unknown": + prefix = f"{task_name}_" + for key, value in n_shot_data.items(): + if key.startswith(prefix): + n_shot = value + break + if n_shot == "unknown" and global_n_shot is not None: + n_shot = global_n_shot # Get the primary metric (usually acc,none) performance = task_results.get("acc,none") From c3e0b41787530a1999a2a3deaea4cdddcf414851 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Thu, 23 Oct 2025 09:55:06 +0200 Subject: [PATCH 39/39] fix: leonardo directory --- oellm/resources/clusters.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/oellm/resources/clusters.yaml b/oellm/resources/clusters.yaml index 738c25a..0fa3f60 100644 --- a/oellm/resources/clusters.yaml +++ b/oellm/resources/clusters.yaml @@ -10,7 +10,7 @@ shared: leonardo: hostname_pattern: "*.leonardo.local" # use this regexp to automatically assign environment variables corresponding to this YAML - EVAL_BASE_DIR: "/leonardo_work/AIFAC_L01_028/shared_evals" + EVAL_BASE_DIR: "/leonardo_work/AIFAC_L01_028/oellm-cli-shared-evals" PARTITION: "boost_usr_prod" # default partition to use ACCOUNT: "AIFAC_L01_028" # default account to use QUEUE_LIMIT: 1000 # maximum number of jobs that can be submitted as job/array, used to send only jobs that respects QOS @@ -28,7 +28,7 @@ jureca: lumi: hostname_pattern: "uan*" - EVAL_BASE_DIR: "/pfs/lustrep4/scratch/project_462000963/shared_evals" + EVAL_BASE_DIR: "/pfs/lustrep4/scratch/project_462000963/oellm-cli-shared-evals" PARTITION: "small-g" ACCOUNT: "project_462000963" QUEUE_LIMIT: 210
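
With the task groups added in PATCH 32 and the reworked `schedule_evals` flow from PATCH 33, a typical end-to-end invocation is sketched below; the checkpoint directory is a placeholder, and the flag spellings assume the CLI mapping of the Python parameter names shown above.

```bash
# Illustrative only: schedule the multilingual super group against local
# checkpoints; the path is a placeholder and flags mirror schedule_evals().
oellm schedule-eval \
    --models /path/to/checkpoints \
    --task_groups oellm-multilingual
```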