From 3a3d6b72477b0eaee5bde5f1a5df59c5d20078a8 Mon Sep 17 00:00:00 2001 From: "Timur M. Carstensen" <40788422+timurcarstensen@users.noreply.github.com> Date: Mon, 13 Oct 2025 22:27:17 +0200 Subject: [PATCH 01/39] Use lm-eval harness for INCLUDE and global MMLU --- README.md | 4 ++ apptainer/jureca.def | 3 + apptainer/leonardo.def | 3 + apptainer/lumi.def | 3 + oellm/interactive_csv_builder.py | 67 ++++++++++++++++--- oellm/light_eval_benchmarks/flores-200-eu.txt | 44 ++++++++++++ oellm/main.py | 62 +++++++++++++++-- oellm/task-groups.yaml | 64 ++++++++++++++++++ oellm/template.sbatch | 62 ++++++++++++++--- 9 files changed, 286 insertions(+), 26 deletions(-) create mode 100644 oellm/light_eval_benchmarks/flores-200-eu.txt diff --git a/README.md b/README.md index 52d59e3..cdc3f89 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,10 @@ This will launch an interactive workflow where you can: - Configure n-shot settings - Preview and save your evaluation configuration +The resulting CSV now includes an additional `eval_suite` column that records which +evaluation framework (e.g., `lm_eval` or `lighteval`) should be used for each +task. + Otherwise you can also directly schedule using a CSV file: ```bash oellm schedule-eval --eval_csv_path custom_evals.csv diff --git a/apptainer/jureca.def b/apptainer/jureca.def index 5aadca9..23cd237 100644 --- a/apptainer/jureca.def +++ b/apptainer/jureca.def @@ -17,6 +17,9 @@ From: nvcr.io/nvidia/pytorch:25.06-py3 uv pip install --system --break-system-packages lm-eval \ "transformers<=4.53.0" "datasets<4.0.0" wandb sentencepiece tiktoken accelerate + # Install LightEval CLI in an isolated environment + uv tool install "lighteval[multilingual]" + %environment # Ensure uv is present inside the container runtime as well export PATH=/root/.local/bin:$PATH diff --git a/apptainer/leonardo.def b/apptainer/leonardo.def index b4ed789..27f0eca 100644 --- a/apptainer/leonardo.def +++ b/apptainer/leonardo.def @@ -17,6 +17,9 @@ From: nvcr.io/nvidia/pytorch:25.06-py3 uv pip install --system --break-system-packages lm-eval \ "transformers<=4.53.0" "datasets<4.0.0" wandb sentencepiece tiktoken accelerate + # Install LightEval CLI in an isolated environment + uv tool install "lighteval[multilingual]" + %environment # Ensure uv is present inside the container runtime as well export PATH=/root/.local/bin:$PATH diff --git a/apptainer/lumi.def b/apptainer/lumi.def index 4724845..020a0e8 100644 --- a/apptainer/lumi.def +++ b/apptainer/lumi.def @@ -17,6 +17,9 @@ From: rocm/pytorch:rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.7.1 uv pip install --system --break-system-packages lm-eval \ "transformers<=4.53.0" "datasets<4.0.0" wandb sentencepiece tiktoken accelerate + # Install LightEval CLI in an isolated environment + uv tool install "lighteval[multilingual]" + %environment # Ensure uv is present inside the container runtime as well export PATH=/root/.local/bin:$PATH diff --git a/oellm/interactive_csv_builder.py b/oellm/interactive_csv_builder.py index 7d81649..9b918a2 100644 --- a/oellm/interactive_csv_builder.py +++ b/oellm/interactive_csv_builder.py @@ -115,7 +115,7 @@ def signal_handler(sig, frame): # Step 2: Configure tasks console.print("\n[bold cyan]πŸ“ Step 2: Configure Tasks[/bold cyan]") - task_configs = [] + task_configs: list[tuple[str, list[int], str]] = [] add_more = True # Load task groups from YAML file @@ -192,9 +192,10 @@ def signal_handler(sig, frame): for task_item in group_data.get("tasks", []): task_name = task_item["task"] n_shots = 
task_item.get("n_shots", [0]) - task_configs.append((task_name, n_shots)) + suite = task_item.get("suite", "lm_eval") + task_configs.append((task_name, n_shots, suite)) console.print( - f" [green]βœ“ Added: {task_name} with n_shot={n_shots}[/green]" + f" [green]βœ“ Added: {task_name} (suite={suite}) with n_shot={n_shots}[/green]" ) # After adding task groups, ask if user wants to add more or proceed @@ -259,17 +260,53 @@ def signal_handler(sig, frame): try: n_shots = [int(x.strip()) for x in n_shots_str.split(",")] - task_configs.append((task, n_shots)) + suite_choice = questionary.select( + f"Select evaluation suite for '{task}':", + choices=[ + questionary.Choice( + "lm_eval (lm-eval-harness)", value="lm_eval" + ), + questionary.Choice( + "lighteval (Hugging Face LightEval)", + value="lighteval", + ), + "πŸ“ Custom suite", + ], + style=custom_style, + ).ask() + + if suite_choice is None: + console.print("\n[yellow]Cancelled by user.[/yellow]") + return + + if suite_choice == "πŸ“ Custom suite": + suite = questionary.text( + "Enter suite identifier:", + instruction="(e.g., custom-eval-suite)", + style=custom_style, + ).ask() + if suite is None: + console.print("\n[yellow]Cancelled by user.[/yellow]") + return + suite = suite.strip() + if not suite: + suite = "lm_eval" + else: + suite = suite_choice + + task_configs.append((task, n_shots, suite)) console.print( - f"[green]βœ“ Added: {task} with n_shot={n_shots}[/green]" + f"[green]βœ“ Added: {task} (suite={suite}) with n_shot={n_shots}[/green]" ) except ValueError: console.print("[red]Invalid n_shot values. Skipping.[/red]") elif action == "πŸ“‹ View current tasks": console.print("\n[bold]Current tasks:[/bold]") - for i, (task, n_shots) in enumerate(task_configs, 1): - console.print(f" {i}. [green]{task}[/green] β†’ n_shot={n_shots}") + for i, (task, n_shots, suite) in enumerate(task_configs, 1): + console.print( + f" {i}. 
[green]{task}[/green] β†’ n_shot={n_shots} (suite={suite})" + ) console.print() elif action == "βœ… Continue to preview": @@ -285,10 +322,15 @@ def signal_handler(sig, frame): rows = [] for model in models: - for task_name, n_shots in task_configs: + for task_name, n_shots, suite in task_configs: for n_shot in n_shots: rows.append( - {"model_path": model, "task_path": task_name, "n_shot": n_shot} + { + "model_path": model, + "task_path": task_name, + "n_shot": n_shot, + "eval_suite": suite, + } ) df = pd.DataFrame(rows) @@ -302,11 +344,16 @@ def signal_handler(sig, frame): table.add_column("Model", style="cyan", no_wrap=True) table.add_column("Task", style="green") table.add_column("n_shot", justify="right", style="yellow") + table.add_column("Suite", style="magenta") # Show first 10 rows for idx, (_, row) in enumerate(df.head(10).iterrows(), 1): table.add_row( - str(idx), str(row["model_path"]), str(row["task_path"]), str(row["n_shot"]) + str(idx), + str(row["model_path"]), + str(row["task_path"]), + str(row["n_shot"]), + str(row["eval_suite"]), ) if len(df) > 10: diff --git a/oellm/light_eval_benchmarks/flores-200-eu.txt b/oellm/light_eval_benchmarks/flores-200-eu.txt new file mode 100644 index 0000000..414ad1d --- /dev/null +++ b/oellm/light_eval_benchmarks/flores-200-eu.txt @@ -0,0 +1,44 @@ +flores200:bul_Cyrl-eng_Latn|0 +flores200:ces_Latn-eng_Latn|0 +flores200:dan_Latn-eng_Latn|0 +flores200:deu_Latn-eng_Latn|0 +flores200:ell_Grek-eng_Latn|0 +flores200:eng_Latn-bul_Cyrl|0 +flores200:eng_Latn-ces_Latn|0 +flores200:eng_Latn-dan_Latn|0 +flores200:eng_Latn-deu_Latn|0 +flores200:eng_Latn-ell_Grek|0 +flores200:eng_Latn-est_Latn|0 +flores200:eng_Latn-fin_Latn|0 +flores200:eng_Latn-fra_Latn|0 +flores200:eng_Latn-hrv_Latn|0 +flores200:eng_Latn-hun_Latn|0 +flores200:eng_Latn-ita_Latn|0 +flores200:eng_Latn-lit_Latn|0 +flores200:eng_Latn-lvs_Latn|0 +flores200:eng_Latn-mlt_Latn|0 +flores200:eng_Latn-nld_Latn|0 +flores200:eng_Latn-pol_Latn|0 +flores200:eng_Latn-por_Latn|0 +flores200:eng_Latn-ron_Latn|0 +flores200:eng_Latn-slk_Latn|0 +flores200:eng_Latn-slv_Latn|0 +flores200:eng_Latn-spa_Latn|0 +flores200:eng_Latn-swe_Latn|0 +flores200:est_Latn-eng_Latn|0 +flores200:fin_Latn-eng_Latn|0 +flores200:fra_Latn-eng_Latn|0 +flores200:hrv_Latn-eng_Latn|0 +flores200:hun_Latn-eng_Latn|0 +flores200:ita_Latn-eng_Latn|0 +flores200:lit_Latn-eng_Latn|0 +flores200:lvs_Latn-eng_Latn|0 +flores200:mlt_Latn-eng_Latn|0 +flores200:nld_Latn-eng_Latn|0 +flores200:pol_Latn-eng_Latn|0 +flores200:por_Latn-eng_Latn|0 +flores200:ron_Latn-eng_Latn|0 +flores200:slk_Latn-eng_Latn|0 +flores200:slv_Latn-eng_Latn|0 +flores200:spa_Latn-eng_Latn|0 +flores200:swe_Latn-eng_Latn|0 diff --git a/oellm/main.py b/oellm/main.py index 77d415e..94c82ad 100644 --- a/oellm/main.py +++ b/oellm/main.py @@ -1,6 +1,7 @@ import logging import os import re +import shutil import socket import subprocess from datetime import datetime @@ -443,6 +444,11 @@ def schedule_evals( f"CSV file must contain the columns: {', '.join(required_cols)}" ) + if "eval_suite" not in df.columns: + df["eval_suite"] = "lm_eval" + else: + df["eval_suite"] = df["eval_suite"].fillna("lm_eval") + # Always expand local model paths, even with skip_checks df["model_path"].unique() expanded_rows = [] @@ -460,6 +466,9 @@ def schedule_evals( expanded_rows.append(row) df = pd.DataFrame(expanded_rows) + if "eval_suite" not in df.columns: + df["eval_suite"] = "lm_eval" + # Download HF models only if skip_checks is False if not skip_checks: # Process any HF models that need downloading @@ 
-514,6 +523,7 @@ def schedule_evals( ), columns=["model_path", "task_path", "n_shot"], ) + df["eval_suite"] = "lm_eval" else: raise ValueError( "Either `eval_csv_path` must be provided, or all of `models`, `tasks`, and `n_shot`." @@ -526,9 +536,13 @@ def schedule_evals( # Ensure that all datasets required by the tasks are cached locally to avoid # network access on compute nodes. if not skip_checks: - _pre_download_task_datasets( - df["task_path"].unique(), trust_remote_code=trust_remote_code - ) + lm_eval_tasks = df[ + df["eval_suite"].str.lower().isin({"lm_eval", "lm-eval", "lm-eval-harness"}) + ]["task_path"].unique() + if len(lm_eval_tasks) > 0: + _pre_download_task_datasets( + lm_eval_tasks, trust_remote_code=trust_remote_code + ) else: logging.info("Skipping dataset pre-download (--skip-checks enabled)") @@ -583,7 +597,12 @@ def schedule_evals( total_minutes = 0 task_time_cache = {} # Cache to avoid recalculating for same tasks - for _, row in df.iterrows(): + lm_eval_mask = df["eval_suite"].str.lower().isin( + {"lm_eval", "lm-eval", "lm-eval-harness"} + ) + light_eval_mask = df["eval_suite"].str.lower().isin({"lighteval", "light-eval"}) + + for _, row in df[lm_eval_mask].iterrows(): task_name = row["task_path"] if task_name not in task_time_cache: task_time_cache[task_name] = _calculate_task_minutes( @@ -591,12 +610,27 @@ def schedule_evals( ) total_minutes += task_time_cache[task_name] + if light_eval_mask.any(): + # LightEval benchmarks can be large; budget 15 minutes per evaluation + light_eval_minutes = int(light_eval_mask.sum() * 15) + total_minutes += light_eval_minutes + logging.info( + "Estimated LightEval time budget: %s minutes across %s evaluations", + light_eval_minutes, + light_eval_mask.sum(), + ) + # Calculate average minutes per eval for logging purposes minutes_per_eval = total_minutes / total_evals if total_evals > 0 else 10 logging.info("πŸ“Š Dynamic time calculation:") for task_name, task_minutes in task_time_cache.items(): - task_count = (df["task_path"] == task_name).sum() + task_count = ( + (df["task_path"] == task_name) + & df["eval_suite"].str.lower().isin( + {"lm_eval", "lm-eval", "lm-eval-harness"} + ) + ).sum() logging.info( f" Task '{task_name}': {task_minutes} min/eval Γ— {task_count} evals = {task_minutes * task_count} total minutes" ) @@ -608,6 +642,24 @@ def schedule_evals( "⚠️ Using fixed 10 min/eval (task complexity detection skipped with --skip-checks)" ) + # Copy LightEval benchmark files into evaluation directory if necessary + light_eval_paths = df[ + df["eval_suite"].str.lower().isin({"lighteval", "light-eval"}) + ]["task_path"].unique() + benchmark_dir = evals_dir / "light_eval_tasks" + copied_paths: dict[str, str] = {} + if light_eval_paths.size > 0: + benchmark_dir.mkdir(parents=True, exist_ok=True) + for task_path in light_eval_paths: + candidate = Path(task_path) + if candidate.exists() and candidate.is_file(): + destination = benchmark_dir / candidate.name + shutil.copy(candidate, destination) + copied_paths[str(candidate)] = str(destination) + + if copied_paths: + df.replace({"task_path": copied_paths}, inplace=True) + # Maximum runtime per job (18 hours with safety margin) max_minutes_per_job = 18 * 60 # 18 hours min_array_size_for_time = max(1, int(np.ceil(total_minutes / max_minutes_per_job))) diff --git a/oellm/task-groups.yaml b/oellm/task-groups.yaml index 2baabea..177cb61 100644 --- a/oellm/task-groups.yaml +++ b/oellm/task-groups.yaml @@ -79,3 +79,67 @@ task_groups: n_shots: [5] - task: belebele_swe_Latn n_shots: [5] + 
oellm-multilingual: + description: "Combined Belebele EU set plus multilingual benchmarks" + tasks: + - task: belebele_bul_Cyrl + n_shots: [5] + - task: belebele_hrv_Latn + n_shots: [5] + - task: belebele_ces_Latn + n_shots: [5] + - task: belebele_dan_Latn + n_shots: [5] + - task: belebele_nld_Latn + n_shots: [5] + - task: belebele_eng_Latn + n_shots: [5] + - task: belebele_est_Latn + n_shots: [5] + - task: belebele_fin_Latn + n_shots: [5] + - task: belebele_fra_Latn + n_shots: [5] + - task: belebele_deu_Latn + n_shots: [5] + - task: belebele_ell_Grek + n_shots: [5] + - task: belebele_hun_Latn + n_shots: [5] + - task: belebele_ita_Latn + n_shots: [5] + - task: belebele_lvs_Latn + n_shots: [5] + - task: belebele_lit_Latn + n_shots: [5] + - task: belebele_mlt_Latn + n_shots: [5] + - task: belebele_pol_Latn + n_shots: [5] + - task: belebele_por_Latn + n_shots: [5] + - task: belebele_ron_Latn + n_shots: [5] + - task: belebele_slk_Latn + n_shots: [5] + - task: belebele_slv_Latn + n_shots: [5] + - task: belebele_spa_Latn + n_shots: [5] + - task: belebele_swe_Latn + n_shots: [5] + - task: xwinograd + n_shots: [0] + - task: xcopa + n_shots: [0] + - task: xstorycloze + n_shots: [0] + - task: global_mmlu + n_shots: [0] + suite: lm_eval + - task: light_eval_benchmarks/flores-200-eu.txt + n_shots: [0] + suite: lighteval + - task: include + n_shots: [0] + suite: lm_eval diff --git a/oellm/template.sbatch b/oellm/template.sbatch index 34c95c3..a4f9317 100644 --- a/oellm/template.sbatch +++ b/oellm/template.sbatch @@ -56,12 +56,13 @@ fi # Use `tail` and `head` to slice the CSV file for the tasks assigned to this job. # The +1 on START_INDEX accounts for the header row. tail -n +$((START_INDEX + 1)) "$CSV_PATH" | head -n $((END_INDEX - START_INDEX + 1)) | \ -while IFS=, read -r model_path task_path n_shot +while IFS=, read -r model_path task_path n_shot eval_suite do # Remove trailing carriage returns if script is edited on Windows model_path=$(echo "$model_path" | tr -d '\r') task_path=$(echo "$task_path" | tr -d '\r') n_shot=$(echo "$n_shot" | tr -d '\r') + eval_suite=$(echo "${eval_suite:-lm_eval}" | tr -d '\r') # Skip empty lines if [ -z "$model_path" ]; then @@ -73,6 +74,7 @@ do echo " Model: $model_path" echo " Task: $task_path" echo " N-shot: $n_shot" + echo " Suite: $eval_suite" echo "----------------------------------------------------" # Build bind paths: always mount the shared eval directory, and additionally @@ -91,16 +93,54 @@ do fi fi - - singularity exec $SINGULARITY_ARGS \ - --bind $BIND_PATHS \ - $EVAL_SIF_PATH \ - python -m lm_eval --model hf \ - --model_args pretrained="$model_path",trust_remote_code=True \ - --tasks "$task_path" \ - --num_fewshot "$n_shot" \ - --output_path "{evals_dir}/$(openssl rand -hex 5).json" \ - --trust_remote_code + suite_normalized=$(echo "$eval_suite" | tr '[:upper:]' '[:lower:]') + + case "$suite_normalized" in + lm_eval|lm-eval|lm-eval-harness) + singularity exec $SINGULARITY_ARGS \ + --bind $BIND_PATHS \ + $EVAL_SIF_PATH \ + python -m lm_eval --model hf \ + --model_args pretrained="$model_path",trust_remote_code=True \ + --tasks "$task_path" \ + --num_fewshot "$n_shot" \ + --output_path "{evals_dir}/$(openssl rand -hex 5).json" \ + --trust_remote_code + ;; + lighteval|light-eval) + LIGHT_TASK="$task_path" + + if [[ -f "$LIGHT_TASK" ]]; then + LIGHT_TASK_ARG="$LIGHT_TASK" + else + last_segment="${LIGHT_TASK##*|}" + if [[ "$LIGHT_TASK" == *"|"* && "$last_segment" =~ ^[0-9]+$ ]]; then + if [[ -n "$n_shot" && "$last_segment" != "$n_shot" ]]; then + 
LIGHT_TASK_ARG="${LIGHT_TASK%|*}|$n_shot" + else + LIGHT_TASK_ARG="$LIGHT_TASK" + fi + else + LIGHT_TASK_ARG="${LIGHT_TASK}|$n_shot" + fi + fi + + RESULTS_SUBDIR="{evals_dir}/$(openssl rand -hex 5)" + mkdir -p "$RESULTS_SUBDIR" + + singularity exec $SINGULARITY_ARGS \ + --bind $BIND_PATHS \ + $EVAL_SIF_PATH \ + lighteval accelerate \ + "model_name=$model_path,trust_remote_code=True" \ + "$LIGHT_TASK_ARG" \ + --output_dir "$RESULTS_SUBDIR" \ + --save_details + ;; + *) + echo "[warning] Unknown evaluation suite '$eval_suite'. Skipping." + ;; + esac echo "Evaluation finished for model: $model_path" From a8104fc637440c3eee3cfb0d99cd7b16ae2bb60c Mon Sep 17 00:00:00 2001 From: "Timur M. Carstensen" <40788422+timurcarstensen@users.noreply.github.com> Date: Tue, 14 Oct 2025 18:33:10 +0200 Subject: [PATCH 02/39] Remove mypy pre-commit hook --- .pre-commit-config.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3de1803..94ccd57 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,9 +19,3 @@ repos: args: [--fix, --exit-non-zero-on-fix] - id: ruff-format - - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.8.0 - hooks: - - id: mypy - additional_dependencies: [types-all] - args: [--ignore-missing-imports] \ No newline at end of file From 3e9a6b62d2521e7b12e704deb3ebc7298e0bd043 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Sun, 19 Oct 2025 22:24:27 +0300 Subject: [PATCH 03/39] chore: remove tests --- tests/__init__.py | 0 tests/test_expand_local_model_paths.py | 189 -------- tests/test_interactive_csv_builder.py | 597 ------------------------- 3 files changed, 786 deletions(-) delete mode 100644 tests/__init__.py delete mode 100644 tests/test_expand_local_model_paths.py delete mode 100644 tests/test_interactive_csv_builder.py diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/test_expand_local_model_paths.py b/tests/test_expand_local_model_paths.py deleted file mode 100644 index a913c47..0000000 --- a/tests/test_expand_local_model_paths.py +++ /dev/null @@ -1,189 +0,0 @@ -import tempfile -from pathlib import Path - -import pytest - -from oellm.main import _expand_local_model_paths - - -class TestExpandLocalModelPaths: - """Test suite for the _expand_local_model_paths function.""" - - @pytest.fixture - def temp_dir(self): - """Create a temporary directory for testing.""" - with tempfile.TemporaryDirectory() as tmpdir: - yield Path(tmpdir) - - def create_safetensors_file(self, path: Path, name: str = "model.safetensors"): - """Helper to create a dummy safetensors file.""" - path.mkdir(parents=True, exist_ok=True) - (path / name).touch() - - def test_single_model_directory(self, temp_dir): - """Test a directory containing safetensors files directly.""" - model_dir = temp_dir / "model" - self.create_safetensors_file(model_dir) - - result = _expand_local_model_paths(str(model_dir)) - - assert len(result) == 1 - assert result[0] == model_dir - - def test_model_with_hf_checkpoints(self, temp_dir): - """Test a model with hf/iter_* checkpoint structure.""" - model_dir = temp_dir / "model" - - # Create checkpoint structure - checkpoint1 = model_dir / "hf" / "iter_0001000" - checkpoint2 = model_dir / "hf" / "iter_0002000" - checkpoint3 = model_dir / "hf" / "iter_0003000" - - self.create_safetensors_file(checkpoint1) - self.create_safetensors_file(checkpoint2) - self.create_safetensors_file(checkpoint3) - - result = 
_expand_local_model_paths(str(model_dir)) - - assert len(result) == 3 - assert checkpoint1 in result - assert checkpoint2 in result - assert checkpoint3 in result - - def test_directory_with_iteration_subdirs(self, temp_dir): - """Test a directory directly containing iter_* subdirectories.""" - model_dir = temp_dir / "model_a" - - # Create iteration directories directly under model_a - iter1 = model_dir / "iter_0001000" - iter2 = model_dir / "iter_0002000" - iter3 = model_dir / "iter_0003000" - - self.create_safetensors_file(iter1) - self.create_safetensors_file(iter2) - self.create_safetensors_file(iter3) - - result = _expand_local_model_paths(str(model_dir)) - - assert len(result) == 3 - assert iter1 in result - assert iter2 in result - assert iter3 in result - - def test_directory_with_multiple_models(self, temp_dir): - """Test a directory containing multiple model subdirectories.""" - parent_dir = temp_dir / "converted_checkpoints" - - # Create multiple models - model1 = parent_dir / "open-sci-ref_model-0.13b_data-c4" - model2 = parent_dir / "open-sci-ref_model-0.35b_data-c4" - - self.create_safetensors_file(model1) - self.create_safetensors_file(model2) - - result = _expand_local_model_paths(str(parent_dir)) - - assert len(result) == 2 - assert model1 in result - assert model2 in result - - def test_multiple_models_with_checkpoints(self, temp_dir): - """Test multiple models each with their own checkpoints.""" - parent_dir = temp_dir / "models" - - # Model 1 with checkpoints - model1_checkpoint1 = parent_dir / "model1" / "hf" / "iter_1000" - model1_checkpoint2 = parent_dir / "model1" / "hf" / "iter_2000" - - # Model 2 with checkpoints - model2_checkpoint1 = parent_dir / "model2" / "hf" / "iter_1000" - model2_checkpoint2 = parent_dir / "model2" / "hf" / "iter_2000" - - self.create_safetensors_file(model1_checkpoint1) - self.create_safetensors_file(model1_checkpoint2) - self.create_safetensors_file(model2_checkpoint1) - self.create_safetensors_file(model2_checkpoint2) - - result = _expand_local_model_paths(str(parent_dir)) - - assert len(result) == 4 - assert model1_checkpoint1 in result - assert model1_checkpoint2 in result - assert model2_checkpoint1 in result - assert model2_checkpoint2 in result - - def test_empty_directory(self, temp_dir): - """Test an empty directory returns no models.""" - empty_dir = temp_dir / "empty" - empty_dir.mkdir() - - result = _expand_local_model_paths(str(empty_dir)) - - assert len(result) == 0 - - def test_non_existent_directory(self, temp_dir): - """Test a non-existent directory returns no models.""" - non_existent = temp_dir / "does_not_exist" - - result = _expand_local_model_paths(str(non_existent)) - - assert len(result) == 0 - - def test_directory_with_non_model_files(self, temp_dir): - """Test a directory with files but no safetensors.""" - dir_with_files = temp_dir / "not_a_model" - dir_with_files.mkdir() - (dir_with_files / "readme.txt").touch() - (dir_with_files / "config.json").touch() - - result = _expand_local_model_paths(str(dir_with_files)) - - assert len(result) == 0 - - def test_mixed_structure(self, temp_dir): - """Test a directory with mixed structure (some models, some checkpoints).""" - parent_dir = temp_dir / "mixed" - - # Direct model - direct_model = parent_dir / "direct_model" - self.create_safetensors_file(direct_model) - - # Model with checkpoints - checkpoint_model = parent_dir / "checkpoint_model" / "hf" / "iter_1000" - self.create_safetensors_file(checkpoint_model) - - # Empty directory - (parent_dir / 
"empty_dir").mkdir(parents=True) - - # Non-model files - (parent_dir / "readme.txt").touch() - - result = _expand_local_model_paths(str(parent_dir)) - - assert len(result) == 2 - assert direct_model in result - assert checkpoint_model in result - - def test_file_instead_of_directory(self, temp_dir): - """Test passing a file instead of a directory.""" - file_path = temp_dir / "file.txt" - file_path.touch() - - result = _expand_local_model_paths(str(file_path)) - - assert len(result) == 0 - - def test_symlinked_directory(self, temp_dir: Path): - """Test handling of symlinked directories.""" - # Create actual model directory - actual_model = temp_dir / "actual_model" - self.create_safetensors_file(actual_model) - - # Create symlink to model - symlink = temp_dir / "symlinked_model" - symlink.symlink_to(actual_model) - - result = _expand_local_model_paths(str(symlink)) - - assert len(result) == 1 - assert result[0] == symlink # Should return the symlink path, not the target diff --git a/tests/test_interactive_csv_builder.py b/tests/test_interactive_csv_builder.py deleted file mode 100644 index e070ea6..0000000 --- a/tests/test_interactive_csv_builder.py +++ /dev/null @@ -1,597 +0,0 @@ -import tempfile -from pathlib import Path -from unittest.mock import mock_open, patch - -import pandas as pd -import pytest -import yaml - -from oellm.interactive_csv_builder import build_csv_interactive - - -class TestInteractiveCSVBuilder: - """Test suite for the interactive CSV builder.""" - - @pytest.fixture - def temp_output_path(self): - """Create a temporary output path for testing.""" - with tempfile.NamedTemporaryFile(suffix=".csv", delete=True) as f: - temp_path = f.name - yield temp_path - # Cleanup - Path(temp_path).unlink(missing_ok=True) - - @patch("oellm.interactive_csv_builder.questionary.select") - @patch("oellm.interactive_csv_builder.questionary.text") - @patch("oellm.interactive_csv_builder.questionary.confirm") - def test_basic_csv_creation( - self, mock_confirm, mock_text, mock_select, temp_output_path - ): - """Test basic CSV creation with one model and one task.""" - # Mock user interactions - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", # Choose to add a model - "βœ… Continue to tasks", # Continue to tasks - "βž• Add a single task", # Add a task - "0 (zero-shot)", # Choose n_shot value - "βœ… Continue to preview", # Continue to preview - ] - - mock_text.return_value.ask.side_effect = [ - "test-model", # Enter model name - "test-task", # Enter task name - ] - - mock_confirm.return_value.ask.return_value = True # Confirm save - - # Run the builder - build_csv_interactive(temp_output_path) - - # Verify CSV was created - assert Path(temp_output_path).exists() - - # Load and verify content - df = pd.read_csv(temp_output_path) - assert len(df) == 1 - assert df.iloc[0]["model_path"] == "test-model" - assert df.iloc[0]["task_path"] == "test-task" - assert df.iloc[0]["n_shot"] == 0 - - @patch("oellm.interactive_csv_builder.questionary.select") - @patch("oellm.interactive_csv_builder.questionary.text") - @patch("oellm.interactive_csv_builder.questionary.confirm") - def test_multiple_models_and_tasks( - self, mock_confirm, mock_text, mock_select, temp_output_path - ): - """Test CSV creation with multiple models and tasks.""" - # Mock user interactions - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", - "βž• Add a model", - "βœ… Continue to tasks", - "βž• Add a single task", - "0,5 (both)", # Multiple n_shot values - "βž• Add a single task", - "5 (few-shot)", - 
"βœ… Continue to preview", - ] - - mock_text.return_value.ask.side_effect = [ - "model1", - "meta-llama/Llama-2-7b-hf", - "task1", - "task2", - ] - - mock_confirm.return_value.ask.return_value = True - - # Run the builder - build_csv_interactive(temp_output_path) - - # Load and verify content - df = pd.read_csv(temp_output_path) - assert len(df) == 6 # 2 models Γ— (2 n_shots for task1 + 1 n_shot for task2) - - # Check all combinations exist - assert set(df["model_path"].unique()) == {"model1", "meta-llama/Llama-2-7b-hf"} - assert set(df["task_path"].unique()) == {"task1", "task2"} - - # Check n_shot values for task1 - task1_df = df[df["task_path"] == "task1"] - assert set(task1_df["n_shot"].unique()) == {0, 5} - - # Check n_shot values for task2 - task2_df = df[df["task_path"] == "task2"] - assert set(task2_df["n_shot"].unique()) == {5} - - @patch("oellm.interactive_csv_builder.questionary.select") - @patch("oellm.interactive_csv_builder.questionary.text") - @patch("oellm.interactive_csv_builder.questionary.confirm") - def test_custom_n_shot_values( - self, mock_confirm, mock_text, mock_select, temp_output_path - ): - """Test custom n_shot value input.""" - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", - "βœ… Continue to tasks", - "βž• Add a single task", - "πŸ“ Custom values", # Choose custom n_shot - "βœ… Continue to preview", - ] - - mock_text.return_value.ask.side_effect = [ - "test-model", - "test-task", - "0,3,7,15", # Custom n_shot values - ] - - mock_confirm.return_value.ask.return_value = True - - build_csv_interactive(temp_output_path) - - df = pd.read_csv(temp_output_path) - assert len(df) == 4 - assert set(df["n_shot"].unique()) == {0, 3, 7, 15} - - @patch("oellm.interactive_csv_builder.questionary.select") - @patch("oellm.interactive_csv_builder.questionary.text") - @patch("oellm.interactive_csv_builder.questionary.confirm") - def test_local_path_model( - self, mock_confirm, mock_text, mock_select, temp_output_path - ): - """Test adding a model via local path.""" - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", - "βœ… Continue to tasks", - "βž• Add a single task", - "0 (zero-shot)", - "βœ… Continue to preview", - ] - - mock_text.return_value.ask.side_effect = [ - "/path/to/local/model", # Enter local path as model - "test-task", # Enter task name - ] - mock_confirm.return_value.ask.return_value = True - - build_csv_interactive(temp_output_path) - - df = pd.read_csv(temp_output_path) - assert df.iloc[0]["model_path"] == "/path/to/local/model" - - @patch("oellm.interactive_csv_builder.questionary.select") - @patch("oellm.interactive_csv_builder.questionary.text") - @patch("oellm.interactive_csv_builder.questionary.confirm") - def test_user_cancellation( - self, mock_confirm, mock_text, mock_select, temp_output_path - ): - """Test user cancellation at various points.""" - # Test cancellation during model input - mock_select.return_value.ask.return_value = None # Simulate Ctrl+C - - build_csv_interactive(temp_output_path) - - # CSV should not be created - assert not Path(temp_output_path).exists() - - @patch("oellm.interactive_csv_builder.questionary.select") - @patch("oellm.interactive_csv_builder.questionary.text") - @patch("oellm.interactive_csv_builder.questionary.confirm") - def test_no_save_confirmation( - self, mock_confirm, mock_text, mock_select, temp_output_path - ): - """Test when user chooses not to save.""" - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", - "βœ… Continue to tasks", - "βž• Add a single task", - 
"0 (zero-shot)", - "βœ… Continue to preview", - ] - - mock_text.return_value.ask.side_effect = [ - "test-model", - "test-task", - ] - - mock_confirm.return_value.ask.return_value = False # Don't save - - build_csv_interactive(temp_output_path) - - # CSV should not be created - assert not Path(temp_output_path).exists() - - @patch("oellm.interactive_csv_builder.questionary.select") - @patch("oellm.interactive_csv_builder.questionary.text") - @patch("oellm.interactive_csv_builder.questionary.confirm") - def test_invalid_n_shot_values( - self, mock_confirm, mock_text, mock_select, temp_output_path - ): - """Test handling of invalid n_shot values.""" - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", - "βœ… Continue to tasks", - "βž• Add a single task", - "πŸ“ Custom values", - "βž• Add a single task", # Add another task after invalid input - "0 (zero-shot)", - "βœ… Continue to preview", - ] - - mock_text.return_value.ask.side_effect = [ - "test-model", - "test-task1", - "invalid,values", # Invalid n_shot values - "test-task2", - ] - - mock_confirm.return_value.ask.return_value = True - - build_csv_interactive(temp_output_path) - - df = pd.read_csv(temp_output_path) - # Only the second task should be in the CSV - assert len(df) == 1 - assert df.iloc[0]["task_path"] == "test-task2" - - @patch("oellm.interactive_csv_builder.questionary.select") - @patch("oellm.interactive_csv_builder.questionary.text") - @patch("oellm.interactive_csv_builder.questionary.confirm") - def test_view_current_models_and_tasks( - self, mock_confirm, mock_text, mock_select, temp_output_path - ): - """Test viewing current models and tasks functionality.""" - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", - "πŸ“‹ View current models", # View models - "βœ… Continue to tasks", - "βž• Add a single task", - "0 (zero-shot)", - "πŸ“‹ View current tasks", # View tasks - "βœ… Continue to preview", - ] - - mock_text.return_value.ask.side_effect = [ - "test-model", - "test-task", - ] - - mock_confirm.return_value.ask.return_value = True - - # This should run without errors - build_csv_interactive(temp_output_path) - - df = pd.read_csv(temp_output_path) - assert len(df) == 1 - - def test_output_directory_creation(self): - """Test that output directory is created if it doesn't exist.""" - with tempfile.TemporaryDirectory() as tmpdir: - nested_path = Path(tmpdir) / "nested" / "dir" / "output.csv" - - with patch( - "oellm.interactive_csv_builder.questionary.select" - ) as mock_select, patch( - "oellm.interactive_csv_builder.questionary.text" - ) as mock_text, patch( - "oellm.interactive_csv_builder.questionary.confirm" - ) as mock_confirm: - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", - "βœ… Continue to tasks", - "βž• Add a single task", - "0 (zero-shot)", - "βœ… Continue to preview", - ] - - mock_text.return_value.ask.side_effect = [ - "test-model", - "test-task", - ] - - mock_confirm.return_value.ask.return_value = True - - build_csv_interactive(str(nested_path)) - - # Check that directory was created - assert nested_path.parent.exists() - assert nested_path.exists() - - @patch("oellm.interactive_csv_builder.questionary.select") - @patch("oellm.interactive_csv_builder.questionary.checkbox") - @patch("oellm.interactive_csv_builder.questionary.confirm") - @patch("pathlib.Path.exists") - @patch("builtins.open", new_callable=mock_open) - def test_single_task_group_selection( - self, - mock_file, - mock_exists, - mock_confirm, - mock_checkbox, - mock_select, - temp_output_path, - 
): - """Test selecting a single task group.""" - # Mock YAML content - yaml_content = { - "task_groups": { - "open-sci-default": { - "description": "Default OpenEuroLLM scientific tasks", - "tasks": [ - {"task": "copa", "n_shots": [0]}, - {"task": "openbookqa", "n_shots": [0]}, - {"task": "mmlu", "n_shots": [5]}, - ], - } - } - } - mock_file.return_value.read.return_value = yaml.dump(yaml_content) - mock_exists.return_value = True - - # Mock user interactions - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", - "βœ… Continue to tasks", - "πŸ“¦ Use a default task group", - "βœ… Continue to preview", # After adding task groups (line 201-208) - ] - - mock_checkbox.return_value.ask.return_value = [ - "open-sci-default - Default OpenEuroLLM scientific tasks" - ] - - # Mock text input for model - with patch("oellm.interactive_csv_builder.questionary.text") as mock_text: - mock_text.return_value.ask.return_value = "test-model" - mock_confirm.return_value.ask.return_value = True - - build_csv_interactive(temp_output_path) - - # Verify CSV was created with correct content - df = pd.read_csv(temp_output_path) - assert len(df) == 3 # 3 tasks from the group - assert set(df["task_path"]) == {"copa", "openbookqa", "mmlu"} - assert df[df["task_path"] == "copa"]["n_shot"].values[0] == 0 - assert df[df["task_path"] == "mmlu"]["n_shot"].values[0] == 5 - - @patch("oellm.interactive_csv_builder.questionary.select") - @patch("oellm.interactive_csv_builder.questionary.checkbox") - @patch("oellm.interactive_csv_builder.questionary.confirm") - @patch("pathlib.Path.exists") - @patch("builtins.open", new_callable=mock_open) - def test_multiple_task_groups_selection( - self, - mock_file, - mock_exists, - mock_confirm, - mock_checkbox, - mock_select, - temp_output_path, - ): - """Test selecting multiple task groups.""" - # Mock YAML content with multiple groups - yaml_content = { - "task_groups": { - "group1": { - "description": "First group", - "tasks": [ - {"task": "task1", "n_shots": [0]}, - {"task": "task2", "n_shots": [5]}, - ], - }, - "group2": { - "description": "Second group", - "tasks": [ - {"task": "task3", "n_shots": [0, 5]}, - {"task": "task4", "n_shots": [10]}, - ], - }, - } - } - mock_file.return_value.read.return_value = yaml.dump(yaml_content) - mock_exists.return_value = True - - # Mock user interactions - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", - "βœ… Continue to tasks", - "πŸ“¦ Use a default task group", - "βœ… Continue to preview", # After adding task groups (line 201-208) - ] - - mock_checkbox.return_value.ask.return_value = [ - "group1 - First group", - "group2 - Second group", - ] - - # Mock text input for model - with patch("oellm.interactive_csv_builder.questionary.text") as mock_text: - mock_text.return_value.ask.return_value = "test-model" - mock_confirm.return_value.ask.return_value = True - - build_csv_interactive(temp_output_path) - - # Verify CSV was created with correct content - df = pd.read_csv(temp_output_path) - assert len(df) == 5 # 2 + 3 (task3 has 2 n_shots) - assert set(df["task_path"]) == {"task1", "task2", "task3", "task4"} - - # Check n_shot values - assert df[df["task_path"] == "task1"]["n_shot"].values[0] == 0 - assert df[df["task_path"] == "task2"]["n_shot"].values[0] == 5 - assert set(df[df["task_path"] == "task3"]["n_shot"].values) == {0, 5} - assert df[df["task_path"] == "task4"]["n_shot"].values[0] == 10 - - @patch("oellm.interactive_csv_builder.questionary.select") - 
@patch("oellm.interactive_csv_builder.questionary.checkbox") - @patch("oellm.interactive_csv_builder.questionary.text") - @patch("oellm.interactive_csv_builder.questionary.confirm") - @patch("pathlib.Path.exists") - @patch("builtins.open", new_callable=mock_open) - def test_task_group_no_selection( - self, - mock_file, - mock_exists, - mock_confirm, - mock_text, - mock_checkbox, - mock_select, - temp_output_path, - ): - """Test when user opens task group menu but doesn't select any.""" - # Mock YAML content - yaml_content = { - "task_groups": { - "group1": { - "description": "Test group", - "tasks": [{"task": "task1", "n_shots": [0]}], - } - } - } - mock_file.return_value.read.return_value = yaml.dump(yaml_content) - mock_exists.return_value = True - - # Mock user interactions - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", - "βœ… Continue to tasks", - "πŸ“¦ Use a default task group", - "βž• Add a single task", # Continue to add single task after no selection - "0 (zero-shot)", - "βœ… Continue to preview", - ] - - mock_checkbox.return_value.ask.return_value = [] # No groups selected - - mock_text.return_value.ask.side_effect = [ - "test-model", - "manual-task", - ] - - mock_confirm.return_value.ask.return_value = True - - build_csv_interactive(temp_output_path) - - # Verify CSV only contains the manually added task - df = pd.read_csv(temp_output_path) - assert len(df) == 1 - assert df["task_path"].values[0] == "manual-task" - assert df["n_shot"].values[0] == 0 - - @patch("oellm.interactive_csv_builder.questionary.select") - @patch("oellm.interactive_csv_builder.questionary.text") - @patch("oellm.interactive_csv_builder.questionary.confirm") - @patch("pathlib.Path.exists") - def test_task_group_yaml_not_found( - self, mock_exists, mock_confirm, mock_text, mock_select, temp_output_path - ): - """Test behavior when task-groups.yaml file doesn't exist.""" - # Mock that the YAML file doesn't exist - mock_exists.return_value = False - - # Mock user interactions - no task group option should appear - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", - "βœ… Continue to tasks", - "βž• Add a single task", # No task group option available - "0 (zero-shot)", - "βœ… Continue to preview", - ] - - mock_text.return_value.ask.side_effect = [ - "test-model", - "test-task", - ] - - mock_confirm.return_value.ask.return_value = True - - build_csv_interactive(temp_output_path) - - # Verify CSV was created with manually added task - df = pd.read_csv(temp_output_path) - assert len(df) == 1 - assert df["model_path"].values[0] == "test-model" - assert df["task_path"].values[0] == "test-task" - assert df["n_shot"].values[0] == 0 - - @patch("oellm.interactive_csv_builder.questionary.select") - @patch("oellm.interactive_csv_builder.questionary.checkbox") - @patch("oellm.interactive_csv_builder.questionary.text") - @patch("oellm.interactive_csv_builder.questionary.confirm") - @patch("pathlib.Path.exists") - @patch("builtins.open", new_callable=mock_open) - def test_task_group_combined_with_individual_tasks( - self, - mock_file, - mock_exists, - mock_confirm, - mock_text, - mock_checkbox, - mock_select, - temp_output_path, - ): - """Test combining task groups with individually added tasks.""" - # Mock YAML content - yaml_content = { - "task_groups": { - "small-group": { - "description": "Small test group", - "tasks": [ - {"task": "group-task1", "n_shots": [0]}, - {"task": "group-task2", "n_shots": [5]}, - ], - } - } - } - mock_file.return_value.read.return_value = 
yaml.dump(yaml_content) - mock_exists.return_value = True - - # Mock user interactions - mock_select.return_value.ask.side_effect = [ - "βž• Add a model", - "βœ… Continue to tasks", - "πŸ“¦ Use a default task group", - "βž• Add more tasks", # Choose to add more after task group - "βž• Add a single task", - "0,5 (both)", - "βž• Add a single task", - "πŸ“ Custom values", - "βœ… Continue to preview", - ] - - mock_checkbox.return_value.ask.return_value = ["small-group - Small test group"] - - mock_text.return_value.ask.side_effect = [ - "test-model", - "individual-task1", - "individual-task2", - "0,10,25", # Custom n_shot values - ] - - mock_confirm.return_value.ask.return_value = True - - build_csv_interactive(temp_output_path) - - # Verify CSV contains both group tasks and individual tasks - df = pd.read_csv(temp_output_path) - - # Should have: 2 group tasks + 2 individual-task1 n_shots + 3 individual-task2 n_shots = 7 - assert len(df) == 7 - - # Check all tasks are present - assert set(df["task_path"]) == { - "group-task1", - "group-task2", - "individual-task1", - "individual-task2", - } - - # Verify n_shot values for each task - assert df[df["task_path"] == "group-task1"]["n_shot"].values[0] == 0 - assert df[df["task_path"] == "group-task2"]["n_shot"].values[0] == 5 - assert set(df[df["task_path"] == "individual-task1"]["n_shot"].values) == {0, 5} - assert set(df[df["task_path"] == "individual-task2"]["n_shot"].values) == { - 0, - 10, - 25, - } From 1a0b16ac4d2e4783ac506477df455ccb66cea06b Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Sun, 19 Oct 2025 22:25:15 +0300 Subject: [PATCH 04/39] fix: lighteval integration --- oellm/main.py | 633 +++++++++++++++++------------------------ oellm/task-groups.yaml | 123 +++++--- oellm/template.sbatch | 11 +- pyproject.toml | 6 +- 4 files changed, 351 insertions(+), 422 deletions(-) diff --git a/oellm/main.py b/oellm/main.py index 94c82ad..72679e8 100644 --- a/oellm/main.py +++ b/oellm/main.py @@ -19,7 +19,7 @@ from rich.logging import RichHandler -def ensure_singularity_image(image_name: str) -> None: +def _ensure_singularity_image(image_name: str) -> None: # TODO: switch to OELLM dataset repo once it is created from huggingface_hub import hf_hub_download @@ -379,9 +379,131 @@ def _pre_download_task_datasets( logging.debug(f"Finished dataset preparation for task '{task_name}'.") +def _pre_download_lighteval_datasets(tasks: Iterable[str]) -> None: + """Pre-download LightEval datasets by instantiating tasks via the local LightEval Registry.""" + import sys + + local_le_src = Path(__file__).parent.parent / "lighteval" / "src" + if local_le_src.exists(): + sys.path.insert(0, str(local_le_src)) + + from lighteval.tasks.registry import Registry, TRUNCATE_FEW_SHOTS_DEFAULTS # type: ignore + from lighteval.tasks.lighteval_task import LightevalTask # type: ignore + + file_task_specs: list[str] = [] + string_task_specs: list[str] = [] + + for t in tasks: + raw = str(t).strip() + if not raw: + continue + candidate = Path(raw) + if candidate.exists() and candidate.is_file(): + file_task_specs.append(str(candidate)) + else: + spec = raw + truncate_default = int(TRUNCATE_FEW_SHOTS_DEFAULTS) + if "|" not in spec: + spec = f"lighteval|{spec}|0|{truncate_default}" + elif spec.count("|") == 1: + spec = f"{spec}|0|{truncate_default}" + elif spec.count("|") == 2: + spec = f"{spec}|{truncate_default}" + string_task_specs.append(spec) + + unique_string_specs = sorted(set(string_task_specs)) + unique_file_specs = sorted(set(file_task_specs)) + + if 
unique_string_specs: + reg = Registry(custom_tasks="lighteval.tasks.multilingual.tasks") + configs = reg.get_tasks_configs(",".join(unique_string_specs)) + task_dict = reg.get_tasks_from_configs(configs) + LightevalTask.load_datasets(task_dict) + + for fp in unique_file_specs: + reg_file = Registry() + configs_file = reg_file.get_tasks_configs(fp) + task_dict_file = reg_file.get_tasks_from_configs(configs_file) + LightevalTask.load_datasets(task_dict_file) + +def _load_task_groups() -> dict[str, dict]: + """Load task groups from `task-groups.yaml` located next to this module.""" + groups_file = Path(__file__).parent / "task-groups.yaml" + if not groups_file.exists(): + raise ValueError(f"Task groups file not found: {groups_file}") + + with open(groups_file) as f: + data = yaml.safe_load(f) or {} + + groups = data.get("task_groups") or {} + if not isinstance(groups, dict): + raise ValueError("Invalid task groups format in task-groups.yaml") + + return groups + + +def _expand_task_groups(group_names: Iterable[str]) -> list[tuple[str, list[int], str]]: + """ + Expand task group names into concrete (task, n_shots, suite) tuples. + + Supports nested groups. Defaults: suite=lm_eval, n_shots=[0] when absent. + A group's `suite` (if present) is inherited by its items and nested groups + unless a leaf explicitly overrides it. + """ + groups = _load_task_groups() + resolved: list[tuple[str, list[int], str]] = [] + + def expand_group(group_name: str, stack: set[str], inherited_suite: str | None = None) -> None: + if group_name not in groups: + raise ValueError(f"Unknown task group: {group_name}") + if group_name in stack: + raise ValueError(f"Cyclic task group reference detected at '{group_name}'") + + stack.add(group_name) + group_default_suite = groups[group_name].get("suite") + effective_inherited_suite = inherited_suite if inherited_suite is not None else group_default_suite + + for item in groups[group_name].get("tasks", []): + task_identifier = str(item.get("task")) + # Prefer explicit suite on the item; otherwise inherit; otherwise default to lm_eval + item_suite = item.get("suite") + suite_name = ( + str(item_suite) + if item_suite is not None + else (str(effective_inherited_suite) if effective_inherited_suite is not None else "lm_eval") + ) + n_shots_value = item.get("n_shots") + + # Nested group reference: propagate the resolved suite + if task_identifier in groups: + next_inherited = str(item_suite) if item_suite is not None else effective_inherited_suite + # Pass down only an inherited suite (or explicit item override) without defaulting to "lm_eval", + # so that the child group's own default `suite` can take effect if present. 
+ expand_group(task_identifier, stack, next_inherited) + continue + + # Leaf task + if not isinstance(n_shots_value, list): + n_shots: list[int] = [0] + else: + # Ensure ints + n_shots = [int(x) for x in n_shots_value] + + resolved.append((task_identifier, n_shots, suite_name)) + stack.remove(group_name) + + for raw_name in group_names: + name = str(raw_name).strip() + if not name: + continue + expand_group(name, set(), None) + + return resolved + def schedule_evals( models: str | None = None, tasks: str | None = None, + task_groups: str | None = None, n_shot: int | list[int] | None = None, eval_csv_path: str | None = None, *, @@ -405,10 +527,13 @@ def schedule_evals( all models in subdirectories will be automatically discovered - For each model directory, if it has an `hf/iter_XXXXX` structure, all checkpoints will be expanded - This allows passing a single directory containing multiple models to evaluate them all - tasks: A string of comma-separated task paths. - n_shot: An integer or list of integers specifying the number of shots for each task. + tasks: A string of comma-separated task names (lm_eval) or paths. + Requires `n_shot` to be provided. Tasks here are assumed to be lm_eval unless otherwise handled via CSV. + task_groups: A string of comma-separated task group names defined in `task-groups.yaml`. + Each group expands into concrete (task, n_shots, suite) entries; `n_shot` is ignored for groups. + n_shot: An integer or list of integers specifying the number of shots applied to `tasks`. eval_csv_path: A path to a CSV file containing evaluation data. - Warning: exclusive argument. Cannot specify `models`, `tasks`, or `n_shot` when `eval_csv_path` is provided. + Warning: exclusive argument. Cannot specify `models`, `tasks`, `task_groups`, or `n_shot` when `eval_csv_path` is provided. max_array_len: The maximum number of jobs to schedule to run concurrently. Warning: this is not the number of jobs in the array job. This is determined by the environment variable `QUEUE_LIMIT`. download_only: If True, only download the datasets and models and exit. @@ -428,14 +553,15 @@ def schedule_evals( "EVAL_CONTAINER_IMAGE is not set. Please set it in clusters.yaml." ) - ensure_singularity_image(image_name) + _ensure_singularity_image(image_name) else: logging.info("Skipping container image check (--skip-checks enabled)") + if eval_csv_path: - if models or tasks or n_shot: + if models or tasks or task_groups or n_shot: raise ValueError( - "Cannot specify `models`, `tasks`, or `n_shot` when `eval_csv_path` is provided." + "Cannot specify `models`, `tasks`, `task_groups`, or `n_shot` when `eval_csv_path` is provided." 
) df = pd.read_csv(eval_csv_path) required_cols = {"model_path", "task_path", "n_shot"} @@ -484,10 +610,9 @@ def schedule_evals( logging.info( "Skipping model path processing and validation (--skip-checks enabled)" ) - - elif models and tasks and n_shot is not None: - model_list = models.split(",") - model_paths = [] + elif models and ((tasks and n_shot is not None) or task_groups): + model_list = [m.strip() for m in models.split(",") if m.strip()] + model_paths: list[Path | str] = [] # Always expand local paths for model in model_list: @@ -512,21 +637,50 @@ def schedule_evals( "Skipping model path processing and validation (--skip-checks enabled)" ) - tasks_list = tasks.split(",") + rows: list[dict[str, Path | str | int]] = [] - # cross product of model_paths and tasks into a dataframe - df = pd.DataFrame( - product( - model_paths, - tasks_list, - n_shot if isinstance(n_shot, list) else [n_shot], - ), - columns=["model_path", "task_path", "n_shot"], - ) - df["eval_suite"] = "lm_eval" + # Handle explicit tasks (lm_eval) with provided n_shot + if tasks: + if n_shot is None: + raise ValueError( + "When specifying `tasks`, you must also provide `n_shot`. For task groups, use `task_groups`." + ) + tasks_list = [t.strip() for t in tasks.split(",") if t.strip()] + shots: list[int] + shots = n_shot if isinstance(n_shot, list) else [int(n_shot)] + for model_path in model_paths: + for task_name in tasks_list: + for s in shots: + rows.append( + { + "model_path": model_path, + "task_path": task_name, + "n_shot": int(s), + "eval_suite": "lm_eval", + } + ) + + # Handle task groups + if task_groups: + group_names = [g.strip() for g in task_groups.split(",") if g.strip()] + # import pdb; pdb.set_trace() + expanded = _expand_task_groups(group_names) + for model_path in model_paths: + for task_name, n_shots, suite_name in expanded: + for s in n_shots: + rows.append( + { + "model_path": model_path, + "task_path": task_name, + "n_shot": int(s), + "eval_suite": suite_name, + } + ) + + df = pd.DataFrame(rows, columns=["model_path", "task_path", "n_shot", "eval_suite"]) else: raise ValueError( - "Either `eval_csv_path` must be provided, or all of `models`, `tasks`, and `n_shot`." + "Provide `eval_csv_path`, or `models` with (`tasks` and `n_shot`) and/or `task_groups`." 
) if df.empty: @@ -543,6 +697,12 @@ def schedule_evals( _pre_download_task_datasets( lm_eval_tasks, trust_remote_code=trust_remote_code ) + # Pre-download LightEval datasets (best-effort, incremental support) + light_eval_tasks = df[ + df["eval_suite"].str.lower().isin({"lighteval", "light-eval"}) + ]["task_path"].unique() + if len(light_eval_tasks) > 0: + _pre_download_lighteval_datasets(light_eval_tasks) else: logging.info("Skipping dataset pre-download (--skip-checks enabled)") @@ -800,7 +960,6 @@ def collect_results( output_csv: str = "eval_results.csv", *, check: bool = False, - reschedule: bool = False, verbose: bool = False, ) -> None: """ @@ -809,16 +968,12 @@ def collect_results( Args: results_dir: Path to the directory containing result JSON files output_csv: Output CSV filename (default: eval_results.csv) - check: Check for crashed or pending evaluations - reschedule: Show overview table and prompt to reschedule failed/pending jobs + check: Check for missing evaluations and create a missing jobs CSV verbose: Enable verbose logging """ import json - from rich.table import Table - _setup_logging(verbose) - console = Console() results_path = Path(results_dir) if not results_path.exists(): @@ -839,13 +994,12 @@ def collect_results( logging.info(f"Found {len(json_files)} result files") - # If check or reschedule mode, also load the jobs.csv to compare - if check or reschedule: + # If check mode, also load the jobs.csv to compare + if check: jobs_csv_path = results_path / "jobs.csv" if not jobs_csv_path.exists(): logging.warning(f"No jobs.csv found in {results_dir}, cannot perform check") check = False - reschedule = False else: jobs_df = pd.read_csv(jobs_csv_path) logging.info(f"Found {len(jobs_df)} scheduled jobs in jobs.csv") @@ -853,72 +1007,62 @@ def collect_results( # Collect results rows = [] completed_jobs = set() # Track (model, task, n_shot) tuples - results_with_performance = ( - 0 # Track how many results actually have performance data - ) for json_file in json_files: - try: - with open(json_file) as f: - data = json.load(f) - - # Extract model name/path - model_name = data.get("model_name", "unknown") - - # Extract results for each task - results = data.get("results", {}) - n_shot_data = data.get("n-shot", {}) - - for task_name, task_results in results.items(): - # Skip MMLU subtasks - only keep the aggregate score - if task_name.startswith("mmlu_") and task_name != "mmlu": - continue - - # Get n_shot for this task - n_shot = n_shot_data.get(task_name, "unknown") - - # Special handling for MMLU aggregate - get n_shot from any MMLU subtask - if task_name == "mmlu" and n_shot == "unknown": - for key, value in n_shot_data.items(): - if key.startswith("mmlu_"): - n_shot = value - break - - # Get the primary metric (usually acc,none) - performance = task_results.get("acc,none") - if performance is None: - # Try other common metric names - for metric in ["acc", "accuracy", "f1", "exact_match"]: - if metric in task_results: - performance = task_results[metric] - break - - if performance is not None: - results_with_performance += 1 - - # Track completed job for check/reschedule mode (only if we have a result) - if check or reschedule: - completed_jobs.add((model_name, task_name, n_shot)) - - rows.append( - { - "model_name": model_name, - "task": task_name, - "n_shot": n_shot, - "performance": performance, - } + with open(json_file) as f: + data = json.load(f) + + # Extract model name/path + model_name = data.get("model_name", "unknown") + + # Extract results for each task + 
results = data.get("results", {}) + n_shot_data = data.get("n-shot", {}) + + for task_name, task_results in results.items(): + # Skip MMLU subtasks - only keep the aggregate score + if task_name.startswith("mmlu_") and task_name != "mmlu": + continue + + # Get n_shot for this task + n_shot = n_shot_data.get(task_name, "unknown") + + # Special handling for MMLU aggregate - get n_shot from any MMLU subtask + if task_name == "mmlu" and n_shot == "unknown": + for key, value in n_shot_data.items(): + if key.startswith("mmlu_"): + n_shot = value + break + + # Get the primary metric (usually acc,none) + performance = task_results.get("acc,none") + if performance is None: + # Try other common metric names + for metric in ["acc", "accuracy", "f1", "exact_match"]: + if metric in task_results: + performance = task_results[metric] + break + + if performance is not None: + # Track completed job for check mode + if check: + completed_jobs.add((model_name, task_name, n_shot)) + + rows.append( + { + "model_name": model_name, + "task": task_name, + "n_shot": n_shot, + "performance": performance, + } + ) + else: + # Debug: log cases where we have a task but no performance metric + if verbose: + logging.debug( + f"No performance metric found for {model_name} | {task_name} | n_shot={n_shot} in {json_file.name}" ) - else: - # Debug: log cases where we have a task but no performance metric - if verbose: - logging.debug( - f"No performance metric found for {model_name} | {task_name} | n_shot={n_shot} in {json_file.name}" - ) - except Exception as e: - logging.warning(f"Failed to process {json_file}: {e}") - if verbose: - logging.exception(e) if not rows and not check: logging.warning("No results extracted from JSON files") @@ -941,101 +1085,23 @@ def collect_results( ) # Perform check analysis if requested - if check or reschedule: + if check: logging.info("\n=== Evaluation Status Check ===") - # Parse SLURM logs to get more detailed status - slurm_logs_dir = results_path / "slurm_logs" - attempted_jobs = set() # Jobs that were attempted (started) - failed_jobs = set() # Jobs that crashed/failed - - if slurm_logs_dir.exists(): - # Parse .out files to find attempted jobs - for out_file in slurm_logs_dir.glob("*.out"): - try: - with open(out_file) as f: - content = f.read() - # Look for "Starting evaluation for:" patterns - import re - - pattern = r"Starting evaluation for:\s*\n\s*Model: (.+)\s*\n\s*Task: (.+)\s*\n\s*N-shot: (\d+)" - matches = re.findall(pattern, content) - for model, task, n_shot in matches: - attempted_jobs.add( - (model.strip(), task.strip(), int(n_shot.strip())) - ) - - # Check if job finished successfully - if "Job" in content and "finished." 
in content: - # This array job completed successfully - pass - else: - # Job might have crashed - check for specific patterns - if ( - "Traceback" in content - or "Error" in content - or "Exception" in content - ): - for model, task, n_shot in matches: - failed_jobs.add( - ( - model.strip(), - task.strip(), - int(n_shot.strip()), - ) - ) - except Exception as e: - logging.debug(f"Error parsing {out_file}: {e}") - - # Parse .err files for errors - for err_file in slurm_logs_dir.glob("*.err"): - try: - file_size = err_file.stat().st_size - if file_size > 0: # Non-empty error file - # Extract array task ID from filename - array_id_match = re.search(r"-(\d+)\.err$", err_file.name) - if array_id_match: - int(array_id_match.group(1)) - # Find corresponding .out file to get job details - out_file = err_file.with_suffix(".out") - if out_file.exists(): - with open(out_file) as f: - content = f.read() - pattern = r"Starting evaluation for:\s*\n\s*Model: (.+)\s*\n\s*Task: (.+)\s*\n\s*N-shot: (\d+)" - matches = re.findall(pattern, content) - for model, task, n_shot in matches: - failed_jobs.add( - ( - model.strip(), - task.strip(), - int(n_shot.strip()), - ) - ) - except Exception as e: - logging.debug(f"Error parsing {err_file}: {e}") - - # Categorize incomplete jobs - still_running_jobs = [] # Jobs that are likely still executing - never_attempted_jobs = [] - crashed_jobs = [] - needs_rerun_jobs = [] # Jobs that definitely need to be rescheduled - - # We know we have exactly len(completed_jobs) completed jobs with actual results - # The rest need to be categorized - len(completed_jobs) + # Find missing jobs + missing_jobs = [] for _, job in jobs_df.iterrows(): job_tuple = (job["model_path"], job["task_path"], job["n_shot"]) # Check if this job corresponds to one of our completed results - # Use the same matching logic as before but don't over-count is_completed = False - # Try to find a matching completed job + # Try exact matching first if job_tuple in completed_jobs: is_completed = True else: - # Try fuzzy matching + # Try fuzzy matching for model names for completed_job in completed_jobs: completed_model, completed_task, completed_n_shot = completed_job @@ -1050,206 +1116,33 @@ def collect_results( is_completed = True break - if is_completed: - continue # Skip completed jobs - - # Job is not completed, categorize it - if job_tuple in failed_jobs: - crashed_jobs.append(job) - needs_rerun_jobs.append(job) - elif job_tuple not in attempted_jobs: - never_attempted_jobs.append(job) - needs_rerun_jobs.append(job) # These likely need rescheduling too - else: - # Job was attempted but not completed and didn't crash - likely still running - still_running_jobs.append(job) - - needs_rerun_df = pd.DataFrame(needs_rerun_jobs) + if not is_completed: + missing_jobs.append(job) - # Calculate completed jobs based on the jobs.csv perspective - actual_completed_from_jobs = ( - len(jobs_df) - - len(still_running_jobs) - - len(crashed_jobs) - - len(never_attempted_jobs) - ) + completed_count = len(jobs_df) - len(missing_jobs) logging.info(f"\nTotal scheduled jobs: {len(jobs_df)}") - logging.info( - f"Completed jobs (from scheduled jobs): {actual_completed_from_jobs}" - ) - logging.info(f"Still running/pending: {len(still_running_jobs)}") - logging.info(f"Failed/Crashed jobs: {len(crashed_jobs)}") - logging.info(f"Never attempted: {len(never_attempted_jobs)}") - logging.info(f"Jobs needing reschedule: {len(needs_rerun_jobs)}") - - if verbose: - logging.info(f"Total CSV rows (results with performance data): 
{len(rows)}") + logging.info(f"Completed jobs: {completed_count}") + logging.info(f"Missing jobs: {len(missing_jobs)}") + + if len(missing_jobs) > 0: + missing_df = pd.DataFrame(missing_jobs) + missing_csv = output_csv.replace(".csv", "_missing.csv") + missing_df.to_csv(missing_csv, index=False) + logging.info(f"\nMissing jobs saved to: {missing_csv}") logging.info( - f"Unique completed jobs found in JSON files: {len(completed_jobs)}" + f"You can run these with: oellm schedule-eval --eval_csv_path {missing_csv}" ) - if len(completed_jobs) != actual_completed_from_jobs: - logging.info( - f"Note: {len(completed_jobs)} results found vs {actual_completed_from_jobs} jobs matched from schedule" - ) - - if len(needs_rerun_jobs) > 0: - if reschedule: - # Show overview table in reschedule mode - console.print("\n[bold cyan]πŸ”„ Jobs Needing Reschedule[/bold cyan]") - - # Create summary table - summary_table = Table( - show_header=True, header_style="bold magenta", box=box.ROUNDED - ) - summary_table.add_column("Status", style="bold") - summary_table.add_column("Count", justify="right", style="cyan") - - summary_table.add_row("βœ… Completed", str(actual_completed_from_jobs)) - summary_table.add_row("πŸƒ Still Running", str(len(still_running_jobs))) - summary_table.add_row("❌ Crashed", str(len(crashed_jobs))) - summary_table.add_row( - "⏭️ Never Attempted", str(len(never_attempted_jobs)) - ) - summary_table.add_row( - "[bold yellow]πŸ”„ Need Reschedule[/bold yellow]", - f"[bold yellow]{len(needs_rerun_jobs)}[/bold yellow]", - ) - - console.print(summary_table) - - # Show detailed table of jobs to reschedule - console.print("\n[bold cyan]πŸ“‹ Detailed Job List[/bold cyan]") - detail_table = Table( - show_header=True, header_style="bold magenta", box=box.ROUNDED - ) - detail_table.add_column("#", style="dim", width=4) - detail_table.add_column("Status", style="bold", width=15) - detail_table.add_column( - "Model", style="cyan", no_wrap=True, max_width=40 - ) - detail_table.add_column("Task", style="green", max_width=20) - detail_table.add_column("n_shot", justify="right", style="yellow") - - # Show first 20 rows - for idx, (_, job) in enumerate(needs_rerun_df.head(20).iterrows(), 1): - if ( - job["model_path"], - job["task_path"], - job["n_shot"], - ) in failed_jobs: - status = "[red]❌ CRASHED[/red]" - else: - status = "[yellow]⏭️ NOT ATTEMPTED[/yellow]" - - # Truncate long model paths for display - model_display = str(job["model_path"]) - if len(model_display) > 40: - model_display = "..." 
+ model_display[-37:] - - detail_table.add_row( - str(idx), - status, - model_display, - str(job["task_path"]), - str(job["n_shot"]), - ) - - if len(needs_rerun_jobs) > 20: - detail_table.add_row("...", "...", "...", "...", "...") - console.print(detail_table) - console.print( - f"\n[dim]Showing 20 of {len(needs_rerun_jobs)} jobs[/dim]" + # Show some examples if verbose + if verbose and len(missing_jobs) > 0: + logging.info("\nExample missing jobs:") + for _i, (_, job) in enumerate(missing_df.head(5).iterrows()): + logging.info( + f" - {job['model_path']} | {job['task_path']} | n_shot={job['n_shot']}" ) - else: - console.print(detail_table) - - # Ask for confirmation - console.print( - f"\n[bold]Total jobs to reschedule: {len(needs_rerun_jobs)}[/bold]" - ) - - import questionary - from questionary import Style - - custom_style = Style( - [ - ("qmark", "fg:#673ab7 bold"), - ("question", "bold"), - ("answer", "fg:#f44336 bold"), - ("pointer", "fg:#673ab7 bold"), - ("highlighted", "fg:#673ab7 bold"), - ("selected", "fg:#cc5454"), - ] - ) - - save_and_schedule = questionary.confirm( - "\nSave failed jobs CSV and schedule re-evaluation?", - default=True, - style=custom_style, - ).ask() - - if save_and_schedule: - # Save the CSV - rerun_csv = output_csv.replace(".csv", "_needs_rerun.csv") - needs_rerun_df.to_csv(rerun_csv, index=False) - console.print(f"\n[green]βœ… Jobs saved to: {rerun_csv}[/green]") - - # Ask if they want to schedule now - schedule_now = questionary.confirm( - "\nSchedule these jobs now?", - default=True, - style=custom_style, - ).ask() - - if schedule_now: - console.print("\n[yellow]To schedule these jobs, run:[/yellow]") - console.print( - f"[bold cyan]oellm schedule-eval --eval_csv_path {rerun_csv}[/bold cyan]" - ) - - else: - # Original behavior for check mode - # Save jobs that need rescheduling - rerun_csv = output_csv.replace(".csv", "_needs_rerun.csv") - needs_rerun_df.to_csv(rerun_csv, index=False) - logging.info(f"\nJobs needing reschedule saved to: {rerun_csv}") - logging.info( - f"You can re-run these with: [bold cyan]oellm schedule-eval --eval_csv_path {rerun_csv}[/bold cyan]" - ) - - # Save crashed jobs separately if any - if crashed_jobs: - crashed_csv = output_csv.replace(".csv", "_crashed.csv") - pd.DataFrame(crashed_jobs).to_csv(crashed_csv, index=False) - logging.info(f"Crashed jobs specifically saved to: {crashed_csv}") - - # Show some examples if verbose - if verbose and len(needs_rerun_jobs) > 0: - logging.info("\nExample jobs needing reschedule:") - for _i, (_, job) in enumerate(needs_rerun_df.head(5).iterrows()): - if ( - job["model_path"], - job["task_path"], - job["n_shot"], - ) in failed_jobs: - status = "CRASHED" - else: - status = "NEVER ATTEMPTED" - logging.info( - f" - [{status}] {job['model_path']} | {job['task_path']} | n_shot={job['n_shot']}" - ) - if len(needs_rerun_jobs) > 5: - logging.info(f" ... and {len(needs_rerun_jobs) - 5} more") - - if still_running_jobs and verbose: - logging.info( - f"\nNote: {len(still_running_jobs)} jobs appear to still be running/pending." - ) - logging.info( - "These were attempted but haven't completed yet. Check SLURM queue status." - ) + if len(missing_jobs) > 5: + logging.info(f" ... 
and {len(missing_jobs) - 5} more") def main(): diff --git a/oellm/task-groups.yaml b/oellm/task-groups.yaml index 177cb61..e108497 100644 --- a/oellm/task-groups.yaml +++ b/oellm/task-groups.yaml @@ -79,67 +79,100 @@ task_groups: n_shots: [5] - task: belebele_swe_Latn n_shots: [5] - oellm-multilingual: - description: "Combined Belebele EU set plus multilingual benchmarks" + flores-200-eu: + description: "Flores 200 EU tasks" + suite: lighteval tasks: - - task: belebele_bul_Cyrl - n_shots: [5] - - task: belebele_hrv_Latn - n_shots: [5] - - task: belebele_ces_Latn - n_shots: [5] - - task: belebele_dan_Latn - n_shots: [5] - - task: belebele_nld_Latn - n_shots: [5] - - task: belebele_eng_Latn + - task: flores200:bul_Cyrl-eng_Latn + n_shots: [0] + - task: flores200:ces_Latn-eng_Latn + n_shots: [0] + - task: flores200:dan_Latn-eng_Latn + n_shots: [0] + - task: flores200:deu_Latn-eng_Latn + n_shots: [0] + - task: flores200:ell_Grek-eng_Latn + n_shots: [0] + - task: flores200:est_Latn-eng_Latn + n_shots: [0] + - task: flores200:fin_Latn-eng_Latn + n_shots: [0] + - task: flores200:fra_Latn-eng_Latn + n_shots: [0] + - task: flores200:gle_Latn-eng_Latn + n_shots: [0] + - task: flores200:hrv_Latn-eng_Latn + n_shots: [0] + - task: flores200:hun_Latn-eng_Latn + n_shots: [0] + - task: flores200:ita_Latn-eng_Latn + n_shots: [0] + - task: flores200:lit_Latn-eng_Latn + n_shots: [0] + - task: flores200:lvs_Latn-eng_Latn + n_shots: [0] + - task: flores200:mlt_Latn-eng_Latn + n_shots: [0] + - task: flores200:nld_Latn-eng_Latn + n_shots: [0] + - task: flores200:pol_Latn-eng_Latn + n_shots: [0] + - task: flores200:por_Latn-eng_Latn + n_shots: [0] + - task: flores200:ron_Latn-eng_Latn + n_shots: [0] + - task: flores200:slk_Latn-eng_Latn + n_shots: [0] + - task: flores200:slv_Latn-eng_Latn + n_shots: [0] + - task: flores200:spa_Latn-eng_Latn + n_shots: [0] + - task: flores200:swe_Latn-eng_Latn + n_shots: [0] + global-mmlu-eu: + description: "Global MMLU EU tasks" + tasks: + - task: global_mmlu_full_cs n_shots: [5] - - task: belebele_est_Latn + - task: global_mmlu_full_de n_shots: [5] - - task: belebele_fin_Latn + - task: global_mmlu_full_el n_shots: [5] - - task: belebele_fra_Latn + - task: global_mmlu_full_en n_shots: [5] - - task: belebele_deu_Latn + - task: global_mmlu_full_es n_shots: [5] - - task: belebele_ell_Grek + - task: global_mmlu_full_fr n_shots: [5] - - task: belebele_hun_Latn + - task: global_mmlu_full_it n_shots: [5] - - task: belebele_ita_Latn + - task: global_mmlu_full_lt n_shots: [5] - - task: belebele_lvs_Latn + - task: global_mmlu_full_nl n_shots: [5] - - task: belebele_lit_Latn + - task: global_mmlu_full_pl n_shots: [5] - - task: belebele_mlt_Latn + - task: global_mmlu_full_pt n_shots: [5] - - task: belebele_pol_Latn + - task: global_mmlu_full_ro n_shots: [5] - - task: belebele_por_Latn + - task: global_mmlu_full_ru n_shots: [5] - - task: belebele_ron_Latn + - task: global_mmlu_full_sr n_shots: [5] - - task: belebele_slk_Latn + - task: global_mmlu_full_sv n_shots: [5] - - task: belebele_slv_Latn + - task: global_mmlu_full_tr n_shots: [5] - - task: belebele_spa_Latn + - task: global_mmlu_full_uk n_shots: [5] - - task: belebele_swe_Latn + - task: global_mmlu_full_he n_shots: [5] - - task: xwinograd - n_shots: [0] - - task: xcopa - n_shots: [0] - - task: xstorycloze - n_shots: [0] - - task: global_mmlu - n_shots: [0] - suite: lm_eval - - task: light_eval_benchmarks/flores-200-eu.txt - n_shots: [0] - suite: lighteval - - task: include - n_shots: [0] - suite: lm_eval + oellm-multilingual: + 
description: "Combined Belebele EU set plus multilingual benchmarks" + tasks: + # - task: belebele-eu-5-shot + # suite: lm_eval + - task: flores-200-eu + # - task: global-mmlu-eu + # suite: lm_eval diff --git a/oellm/template.sbatch b/oellm/template.sbatch index a4f9317..f4b4905 100644 --- a/oellm/template.sbatch +++ b/oellm/template.sbatch @@ -20,6 +20,7 @@ export HF_XET_CACHE="$HF_HOME/xet" export HF_ASSETS_CACHE="$HF_HOME/assets" export HUGGINGFACE_HUB_CACHE="$HF_HOME/hub" export HUGGINGFACE_ASSETS_CACHE="$HF_HOME/assets" +export HF_DATASETS_CACHE="$HF_HOME/datasets" export HF_HUB_OFFLINE=1 # Path to the shared Singularity image that contains all runtime deps @@ -62,7 +63,7 @@ do model_path=$(echo "$model_path" | tr -d '\r') task_path=$(echo "$task_path" | tr -d '\r') n_shot=$(echo "$n_shot" | tr -d '\r') - eval_suite=$(echo "${eval_suite:-lm_eval}" | tr -d '\r') + eval_suite=$(echo "${{eval_suite:-lm_eval}}" | tr -d '\r') # Skip empty lines if [ -z "$model_path" ]; then @@ -113,15 +114,15 @@ do if [[ -f "$LIGHT_TASK" ]]; then LIGHT_TASK_ARG="$LIGHT_TASK" else - last_segment="${LIGHT_TASK##*|}" + last_segment="${{LIGHT_TASK##*|}}" if [[ "$LIGHT_TASK" == *"|"* && "$last_segment" =~ ^[0-9]+$ ]]; then if [[ -n "$n_shot" && "$last_segment" != "$n_shot" ]]; then - LIGHT_TASK_ARG="${LIGHT_TASK%|*}|$n_shot" + LIGHT_TASK_ARG="${{LIGHT_TASK%|*}}|$n_shot" else LIGHT_TASK_ARG="$LIGHT_TASK" fi else - LIGHT_TASK_ARG="${LIGHT_TASK}|$n_shot" + LIGHT_TASK_ARG="${{LIGHT_TASK}}|$n_shot" fi fi @@ -146,4 +147,4 @@ do done -echo "Job $SLURM_ARRAY_TASK_ID finished." +echo "Job $SLURM_ARRAY_TASK_ID finished." \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 262103d..be55756 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,11 +6,13 @@ readme = "README.md" requires-python = ">=3.12" dependencies = [ "pandas", - "jsonargparse[all]", - "datasets<4.0.0", + "jsonargparse", + "datasets", "rich", "torch", "lm-eval", + "lighteval[extended_tasks,multilingual] @ git+https://github.com/huggingface/lighteval.git@63424f4e795ecc577b90646381b374af3a627978", + "pydantic<2.12", "huggingface_hub", "pyyaml", "questionary", From f9c5bcec47da95533bde632a8052110b9a09963d Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Mon, 20 Oct 2025 10:24:16 +0300 Subject: [PATCH 05/39] fix: lumi paths --- apptainer/lumi.def | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/apptainer/lumi.def b/apptainer/lumi.def index 020a0e8..a7d71d7 100644 --- a/apptainer/lumi.def +++ b/apptainer/lumi.def @@ -2,27 +2,28 @@ Bootstrap: docker From: rocm/pytorch:rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.7.1 %labels - Author multi-cluster-eval - Description Apptainer image for LUMI cluster (converted from dockerfile) + Author oellm-cli + Description Apptainer image for LUMI cluster %post - # 1. Install uv package manager - curl -LsSf https://astral.sh/uv/install.sh | sh - echo 'export PATH=$HOME/.local/bin:$PATH' >> /etc/profile + # Install uv into a global bin + curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR=/usr/local/bin sh - # Make uv visible for subsequent commands during build - export PATH=/root/.local/bin:$PATH + # Put uv-installed tool shims in a global bin too + export UV_TOOL_BIN_DIR=/usr/local/bin + uv --version - # 2. 
Install Python dependencies uv pip install --system --break-system-packages lm-eval \ "transformers<=4.53.0" "datasets<4.0.0" wandb sentencepiece tiktoken accelerate - # Install LightEval CLI in an isolated environment + # Optional: keep tool envs under /opt to avoid $HOME + export UV_TOOL_DIR=/opt/uv-tools uv tool install "lighteval[multilingual]" - + %environment - # Ensure uv is present inside the container runtime as well - export PATH=/root/.local/bin:$PATH + export PATH=/usr/local/bin:$PATH + export UV_TOOL_BIN_DIR=/usr/local/bin + export UV_TOOL_DIR=/opt/uv-tools %runscript exec bash "$@" \ No newline at end of file From 64287d4c02235071109a516d57276b494eceb5aa Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Mon, 20 Oct 2025 13:48:05 +0300 Subject: [PATCH 06/39] fix: faster compression --- .github/workflows/build-and-push-apptainer.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-and-push-apptainer.yml b/.github/workflows/build-and-push-apptainer.yml index b066e28..db5ed7c 100644 --- a/.github/workflows/build-and-push-apptainer.yml +++ b/.github/workflows/build-and-push-apptainer.yml @@ -37,7 +37,7 @@ jobs: - name: Build SIF from definition file run: | - apptainer --verbose build --fakeroot eval_env-${{ matrix.image }}.sif apptainer/${{ matrix.image }}.def + apptainer --verbose build --mksquashfs-args="-comp zstd -Xcompression-level 22" --fakeroot eval_env-${{ matrix.image }}.sif apptainer/${{ matrix.image }}.def - name: Install Hugging Face Hub CLI run: pip install --upgrade "huggingface_hub" From 2674439d7820cfc581792528cffbcb337e1bfec6 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Mon, 20 Oct 2025 14:03:21 +0300 Subject: [PATCH 07/39] fix: faster compression --- .github/workflows/build-and-push-apptainer.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-and-push-apptainer.yml b/.github/workflows/build-and-push-apptainer.yml index db5ed7c..197816e 100644 --- a/.github/workflows/build-and-push-apptainer.yml +++ b/.github/workflows/build-and-push-apptainer.yml @@ -37,7 +37,7 @@ jobs: - name: Build SIF from definition file run: | - apptainer --verbose build --mksquashfs-args="-comp zstd -Xcompression-level 22" --fakeroot eval_env-${{ matrix.image }}.sif apptainer/${{ matrix.image }}.def + apptainer --verbose build --mksquashfs-args="-comp zstd -Xcompression-level 1" --fakeroot eval_env-${{ matrix.image }}.sif apptainer/${{ matrix.image }}.def - name: Install Hugging Face Hub CLI run: pip install --upgrade "huggingface_hub" From 10d42173ad78e3c6a808a25050faef3b1ebf3d31 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Mon, 20 Oct 2025 15:44:49 +0300 Subject: [PATCH 08/39] chore: remove unnecessary files --- oellm/light_eval_benchmarks/flores-200-eu.txt | 44 ------------------- 1 file changed, 44 deletions(-) delete mode 100644 oellm/light_eval_benchmarks/flores-200-eu.txt diff --git a/oellm/light_eval_benchmarks/flores-200-eu.txt b/oellm/light_eval_benchmarks/flores-200-eu.txt deleted file mode 100644 index 414ad1d..0000000 --- a/oellm/light_eval_benchmarks/flores-200-eu.txt +++ /dev/null @@ -1,44 +0,0 @@ -flores200:bul_Cyrl-eng_Latn|0 -flores200:ces_Latn-eng_Latn|0 -flores200:dan_Latn-eng_Latn|0 -flores200:deu_Latn-eng_Latn|0 -flores200:ell_Grek-eng_Latn|0 -flores200:eng_Latn-bul_Cyrl|0 -flores200:eng_Latn-ces_Latn|0 -flores200:eng_Latn-dan_Latn|0 -flores200:eng_Latn-deu_Latn|0 -flores200:eng_Latn-ell_Grek|0 -flores200:eng_Latn-est_Latn|0 -flores200:eng_Latn-fin_Latn|0 
-flores200:eng_Latn-fra_Latn|0 -flores200:eng_Latn-hrv_Latn|0 -flores200:eng_Latn-hun_Latn|0 -flores200:eng_Latn-ita_Latn|0 -flores200:eng_Latn-lit_Latn|0 -flores200:eng_Latn-lvs_Latn|0 -flores200:eng_Latn-mlt_Latn|0 -flores200:eng_Latn-nld_Latn|0 -flores200:eng_Latn-pol_Latn|0 -flores200:eng_Latn-por_Latn|0 -flores200:eng_Latn-ron_Latn|0 -flores200:eng_Latn-slk_Latn|0 -flores200:eng_Latn-slv_Latn|0 -flores200:eng_Latn-spa_Latn|0 -flores200:eng_Latn-swe_Latn|0 -flores200:est_Latn-eng_Latn|0 -flores200:fin_Latn-eng_Latn|0 -flores200:fra_Latn-eng_Latn|0 -flores200:hrv_Latn-eng_Latn|0 -flores200:hun_Latn-eng_Latn|0 -flores200:ita_Latn-eng_Latn|0 -flores200:lit_Latn-eng_Latn|0 -flores200:lvs_Latn-eng_Latn|0 -flores200:mlt_Latn-eng_Latn|0 -flores200:nld_Latn-eng_Latn|0 -flores200:pol_Latn-eng_Latn|0 -flores200:por_Latn-eng_Latn|0 -flores200:ron_Latn-eng_Latn|0 -flores200:slk_Latn-eng_Latn|0 -flores200:slv_Latn-eng_Latn|0 -flores200:spa_Latn-eng_Latn|0 -flores200:swe_Latn-eng_Latn|0 From e2c866ac002cc30d6e7b97103997df7cb407cbc1 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Mon, 20 Oct 2025 15:45:21 +0300 Subject: [PATCH 09/39] fix: ruff formatting target version --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index be55756..2b7a64a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,8 +44,8 @@ url = "https://download.pytorch.org/whl/cpu" explicit = true [tool.ruff] -line-length = 88 -target-version = "py38" +line-length = 90 +target-version = "py312" [tool.ruff.lint] select = [ From 20f04e963a2ab50afe6971a2ea3c5f5f0265b347 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Mon, 20 Oct 2025 15:50:44 +0300 Subject: [PATCH 10/39] chore: restructure task-groups into groups and super-groups --- oellm/task-groups.yaml | 156 +++++++++++++++++------------------------ 1 file changed, 63 insertions(+), 93 deletions(-) diff --git a/oellm/task-groups.yaml b/oellm/task-groups.yaml index e108497..8f91679 100644 --- a/oellm/task-groups.yaml +++ b/oellm/task-groups.yaml @@ -5,6 +5,7 @@ task_groups: open-sci-0.01: description: "open-sci-ref 0.01 evals" + suite: lm-eval-harness tasks: - task: copa n_shots: [0] @@ -32,147 +33,116 @@ task_groups: n_shots: [10] belebele-eu-5-shot: description: "Belebele European language tasks" + suite: lm-eval-harness + n_shots: [5] tasks: - task: belebele_bul_Cyrl - n_shots: [5] - task: belebele_hrv_Latn - n_shots: [5] - task: belebele_ces_Latn - n_shots: [5] - task: belebele_dan_Latn - n_shots: [5] - task: belebele_nld_Latn - n_shots: [5] - task: belebele_eng_Latn - n_shots: [5] - task: belebele_est_Latn - n_shots: [5] - task: belebele_fin_Latn - n_shots: [5] - task: belebele_fra_Latn - n_shots: [5] - task: belebele_deu_Latn - n_shots: [5] - task: belebele_ell_Grek - n_shots: [5] - task: belebele_hun_Latn - n_shots: [5] - task: belebele_ita_Latn - n_shots: [5] - task: belebele_lvs_Latn - n_shots: [5] - task: belebele_lit_Latn - n_shots: [5] - task: belebele_mlt_Latn - n_shots: [5] - task: belebele_pol_Latn - n_shots: [5] - task: belebele_por_Latn - n_shots: [5] - task: belebele_ron_Latn - n_shots: [5] - task: belebele_slk_Latn - n_shots: [5] - task: belebele_slv_Latn - n_shots: [5] - task: belebele_spa_Latn - n_shots: [5] - task: belebele_swe_Latn - n_shots: [5] - flores-200-eu: - description: "Flores 200 EU tasks" + flores-200-eu-to-eng: + description: "Flores 200 EU to English translation" suite: lighteval + n_shots: [0] tasks: - task: flores200:bul_Cyrl-eng_Latn - n_shots: [0] - - task: 
flores200:ces_Latn-eng_Latn - n_shots: [0] - - task: flores200:dan_Latn-eng_Latn - n_shots: [0] - - task: flores200:deu_Latn-eng_Latn - n_shots: [0] - - task: flores200:ell_Grek-eng_Latn - n_shots: [0] - - task: flores200:est_Latn-eng_Latn - n_shots: [0] - - task: flores200:fin_Latn-eng_Latn - n_shots: [0] - - task: flores200:fra_Latn-eng_Latn - n_shots: [0] - - task: flores200:gle_Latn-eng_Latn - n_shots: [0] - - task: flores200:hrv_Latn-eng_Latn - n_shots: [0] - - task: flores200:hun_Latn-eng_Latn - n_shots: [0] - - task: flores200:ita_Latn-eng_Latn - n_shots: [0] - - task: flores200:lit_Latn-eng_Latn - n_shots: [0] - - task: flores200:lvs_Latn-eng_Latn - n_shots: [0] - - task: flores200:mlt_Latn-eng_Latn - n_shots: [0] - - task: flores200:nld_Latn-eng_Latn - n_shots: [0] - - task: flores200:pol_Latn-eng_Latn - n_shots: [0] - - task: flores200:por_Latn-eng_Latn - n_shots: [0] - - task: flores200:ron_Latn-eng_Latn - n_shots: [0] - - task: flores200:slk_Latn-eng_Latn - n_shots: [0] - - task: flores200:slv_Latn-eng_Latn - n_shots: [0] - - task: flores200:spa_Latn-eng_Latn - n_shots: [0] - - task: flores200:swe_Latn-eng_Latn - n_shots: [0] + # - task: flores200:ces_Latn-eng_Latn + # - task: flores200:dan_Latn-eng_Latn + # - task: flores200:deu_Latn-eng_Latn + # - task: flores200:ell_Grek-eng_Latn + # - task: flores200:est_Latn-eng_Latn + # - task: flores200:fin_Latn-eng_Latn + # - task: flores200:fra_Latn-eng_Latn + # - task: flores200:gle_Latn-eng_Latn + # - task: flores200:hrv_Latn-eng_Latn + # - task: flores200:hun_Latn-eng_Latn + # - task: flores200:ita_Latn-eng_Latn + # - task: flores200:lit_Latn-eng_Latn + # - task: flores200:lvs_Latn-eng_Latn + # - task: flores200:mlt_Latn-eng_Latn + # - task: flores200:nld_Latn-eng_Latn + # - task: flores200:pol_Latn-eng_Latn + # - task: flores200:por_Latn-eng_Latn + # - task: flores200:ron_Latn-eng_Latn + # - task: flores200:slk_Latn-eng_Latn + # - task: flores200:slv_Latn-eng_Latn + # - task: flores200:spa_Latn-eng_Latn + # - task: flores200:swe_Latn-eng_Latn + flores-200-eng-to-eu: + description: "Flores 200 English to EU translation" + suite: lighteval + n_shots: [0] + tasks: + - task: flores200:eng_Latn-bul_Cyrl + - task: flores200:eng_Latn-ces_Latn + - task: flores200:eng_Latn-dan_Latn + - task: flores200:eng_Latn-deu_Latn + - task: flores200:eng_Latn-ell_Grek + - task: flores200:eng_Latn-est_Latn + - task: flores200:eng_Latn-fin_Latn + - task: flores200:eng_Latn-fra_Latn + - task: flores200:eng_Latn-gle_Latn + - task: flores200:eng_Latn-hrv_Latn + - task: flores200:eng_Latn-hun_Latn + - task: flores200:eng_Latn-ita_Latn + - task: flores200:eng_Latn-lit_Latn + - task: flores200:eng_Latn-lvs_Latn + - task: flores200:eng_Latn-mlt_Latn + - task: flores200:eng_Latn-nld_Latn + - task: flores200:eng_Latn-pol_Latn + - task: flores200:eng_Latn-por_Latn + - task: flores200:eng_Latn-ron_Latn + - task: flores200:eng_Latn-slk_Latn + - task: flores200:eng_Latn-slv_Latn + - task: flores200:eng_Latn-spa_Latn + - task: flores200:eng_Latn-swe_Latn global-mmlu-eu: description: "Global MMLU EU tasks" + suite: lm-eval-harness + n_shots: [5] tasks: - task: global_mmlu_full_cs - n_shots: [5] - task: global_mmlu_full_de - n_shots: [5] - task: global_mmlu_full_el - n_shots: [5] - task: global_mmlu_full_en - n_shots: [5] - task: global_mmlu_full_es - n_shots: [5] - task: global_mmlu_full_fr - n_shots: [5] - task: global_mmlu_full_it - n_shots: [5] - task: global_mmlu_full_lt - n_shots: [5] - task: global_mmlu_full_nl - n_shots: [5] - task: global_mmlu_full_pl - 
n_shots: [5] - task: global_mmlu_full_pt - n_shots: [5] - task: global_mmlu_full_ro - n_shots: [5] - task: global_mmlu_full_ru - n_shots: [5] - task: global_mmlu_full_sr - n_shots: [5] - task: global_mmlu_full_sv - n_shots: [5] - task: global_mmlu_full_tr - n_shots: [5] - task: global_mmlu_full_uk - n_shots: [5] - task: global_mmlu_full_he - n_shots: [5] + +super_groups: oellm-multilingual: description: "Combined Belebele EU set plus multilingual benchmarks" - tasks: + task_groups: # - task: belebele-eu-5-shot - # suite: lm_eval - - task: flores-200-eu - # - task: global-mmlu-eu - # suite: lm_eval + - task: flores-200-eu-to-eng + # - task: global-mmlu-eu \ No newline at end of file From 73e23772b8824f1e8b60bcae8c71eeb25c1bf3ea Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Mon, 20 Oct 2025 15:51:14 +0300 Subject: [PATCH 11/39] feat: task-cache prototype --- oellm/task_cache.py | 54 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 oellm/task_cache.py diff --git a/oellm/task_cache.py b/oellm/task_cache.py new file mode 100644 index 0000000..49f7f85 --- /dev/null +++ b/oellm/task_cache.py @@ -0,0 +1,54 @@ +import json +from datetime import datetime +from pathlib import Path + + +TASK_CACHE_TTL_DAYS = 30 + + +def get_task_cache_file() -> Path: + return Path(__file__).resolve().parent / "task_map_cache.json" + + +def load_task_cache() -> dict: + cache_file = get_task_cache_file() + if not cache_file.exists(): + return {} + with open(cache_file, "r") as f: + return json.load(f) or {} + + +def save_task_cache(cache: dict) -> None: + cache_file = get_task_cache_file() + with open(cache_file, "w") as f: + json.dump(cache, f, indent=2, sort_keys=True) + + +def task_cache_key(framework: str, task_id: str) -> str: + return f"{framework}::{task_id}" + + +def task_cache_is_fresh(entry: dict, ttl_days: int = TASK_CACHE_TTL_DAYS) -> bool: + ts = float(entry.get("ts", 0)) + age_days = (datetime.now().timestamp() - ts) / 86400.0 + return age_days >= 0 and age_days < float(ttl_days) + + +def task_cache_lookup( + framework: str, task_id: str, ttl_days: int = TASK_CACHE_TTL_DAYS +) -> bool: + cache = load_task_cache() + key = task_cache_key(framework, task_id) + entry = cache.get(key) + if not isinstance(entry, dict): + return False + return task_cache_is_fresh(entry, ttl_days) + + +def task_cache_mark_resolved(framework: str, task_id: str) -> None: + cache = load_task_cache() + key = task_cache_key(framework, task_id) + cache[key] = {"ts": datetime.now().timestamp()} + save_task_cache(cache) + + From f831fbc7b0ed9107b7f8df7ec9c86b581958affe Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Tue, 21 Oct 2025 11:40:51 +0300 Subject: [PATCH 12/39] feat: task super groups --- .gitignore | 1 + oellm/task-groups.yaml | 53 ++++++++++++++++++++---------------------- 2 files changed, 26 insertions(+), 28 deletions(-) diff --git a/.gitignore b/.gitignore index 77fc697..9e29ad2 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ **/*.egg-info **/*.csv **/uv.lock +**/task_map_cache.json \ No newline at end of file diff --git a/oellm/task-groups.yaml b/oellm/task-groups.yaml index 8f91679..e69de26 100644 --- a/oellm/task-groups.yaml +++ b/oellm/task-groups.yaml @@ -1,7 +1,3 @@ -# Default task groups for interactive CSV builder -# Each group contains a list of tasks with their n_shot values -# Format: task_name,n_shot1,n_shot2,... 
- task_groups: open-sci-0.01: description: "open-sci-ref 0.01 evals" @@ -65,28 +61,28 @@ task_groups: n_shots: [0] tasks: - task: flores200:bul_Cyrl-eng_Latn - # - task: flores200:ces_Latn-eng_Latn - # - task: flores200:dan_Latn-eng_Latn - # - task: flores200:deu_Latn-eng_Latn - # - task: flores200:ell_Grek-eng_Latn - # - task: flores200:est_Latn-eng_Latn - # - task: flores200:fin_Latn-eng_Latn - # - task: flores200:fra_Latn-eng_Latn - # - task: flores200:gle_Latn-eng_Latn - # - task: flores200:hrv_Latn-eng_Latn - # - task: flores200:hun_Latn-eng_Latn - # - task: flores200:ita_Latn-eng_Latn - # - task: flores200:lit_Latn-eng_Latn - # - task: flores200:lvs_Latn-eng_Latn - # - task: flores200:mlt_Latn-eng_Latn - # - task: flores200:nld_Latn-eng_Latn - # - task: flores200:pol_Latn-eng_Latn - # - task: flores200:por_Latn-eng_Latn - # - task: flores200:ron_Latn-eng_Latn - # - task: flores200:slk_Latn-eng_Latn - # - task: flores200:slv_Latn-eng_Latn - # - task: flores200:spa_Latn-eng_Latn - # - task: flores200:swe_Latn-eng_Latn + - task: flores200:ces_Latn-eng_Latn + - task: flores200:dan_Latn-eng_Latn + - task: flores200:deu_Latn-eng_Latn + - task: flores200:ell_Grek-eng_Latn + - task: flores200:est_Latn-eng_Latn + - task: flores200:fin_Latn-eng_Latn + - task: flores200:fra_Latn-eng_Latn + - task: flores200:gle_Latn-eng_Latn + - task: flores200:hrv_Latn-eng_Latn + - task: flores200:hun_Latn-eng_Latn + - task: flores200:ita_Latn-eng_Latn + - task: flores200:lit_Latn-eng_Latn + - task: flores200:lvs_Latn-eng_Latn + - task: flores200:mlt_Latn-eng_Latn + - task: flores200:nld_Latn-eng_Latn + - task: flores200:pol_Latn-eng_Latn + - task: flores200:por_Latn-eng_Latn + - task: flores200:ron_Latn-eng_Latn + - task: flores200:slk_Latn-eng_Latn + - task: flores200:slv_Latn-eng_Latn + - task: flores200:spa_Latn-eng_Latn + - task: flores200:swe_Latn-eng_Latn flores-200-eng-to-eu: description: "Flores 200 English to EU translation" suite: lighteval @@ -143,6 +139,7 @@ super_groups: oellm-multilingual: description: "Combined Belebele EU set plus multilingual benchmarks" task_groups: - # - task: belebele-eu-5-shot - task: flores-200-eu-to-eng - # - task: global-mmlu-eu \ No newline at end of file + - task: flores-200-eng-to-eu + - task: belebele-eu-5-shot + - task: global-mmlu-eu \ No newline at end of file From 5fe62ee69554511df16a0efc6bf9e4f38d0e81af Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Tue, 21 Oct 2025 11:41:29 +0300 Subject: [PATCH 13/39] task cache fix --- oellm/task_cache.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/oellm/task_cache.py b/oellm/task_cache.py index 49f7f85..e457e77 100644 --- a/oellm/task_cache.py +++ b/oellm/task_cache.py @@ -2,7 +2,6 @@ from datetime import datetime from pathlib import Path - TASK_CACHE_TTL_DAYS = 30 @@ -50,5 +49,3 @@ def task_cache_mark_resolved(framework: str, task_id: str) -> None: key = task_cache_key(framework, task_id) cache[key] = {"ts": datetime.now().timestamp()} save_task_cache(cache) - - From e816bfdff030de8ba7bc9c604eb746749693de79 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Tue, 21 Oct 2025 22:32:12 +0300 Subject: [PATCH 14/39] fix: task cache; moving data files to oellm/resources --- oellm/interactive_csv_builder.py | 16 +- oellm/main.py | 581 ++----------------------- oellm/resources/__init__.py | 0 oellm/{ => resources}/clusters.yaml | 3 +- oellm/{ => resources}/task-groups.yaml | 8 +- oellm/{ => resources}/template.sbatch | 2 +- oellm/task_cache.py | 285 +++++++++++- oellm/task_groups.py | 131 ++++++ oellm/utils.py | 
480 ++++++++++++++++++++ pyproject.toml | 13 +- 10 files changed, 941 insertions(+), 578 deletions(-) create mode 100644 oellm/resources/__init__.py rename oellm/{ => resources}/clusters.yaml (95%) rename oellm/{ => resources}/task-groups.yaml (96%) rename oellm/{ => resources}/template.sbatch (99%) create mode 100644 oellm/task_groups.py create mode 100644 oellm/utils.py diff --git a/oellm/interactive_csv_builder.py b/oellm/interactive_csv_builder.py index 9b918a2..61c99f1 100644 --- a/oellm/interactive_csv_builder.py +++ b/oellm/interactive_csv_builder.py @@ -1,5 +1,6 @@ import signal import sys +from importlib.resources import files from pathlib import Path import pandas as pd @@ -118,16 +119,13 @@ def signal_handler(sig, frame): task_configs: list[tuple[str, list[int], str]] = [] add_more = True - # Load task groups from YAML file - task_groups_file = Path(__file__).parent / "task-groups.yaml" + # Load task groups from packaged resources task_groups = {} - if task_groups_file.exists(): - try: - with open(task_groups_file) as f: - data = yaml.safe_load(f) - task_groups = data.get("task_groups", {}) - except Exception as e: - console.print(f"[yellow]Warning: Could not load task groups: {e}[/yellow]") + try: + data = yaml.safe_load((files("oellm.resources") / "task-groups.yaml").read_text()) + task_groups = data.get("task_groups", {}) + except Exception as e: + console.print(f"[yellow]Warning: Could not load task groups: {e}[/yellow]") while add_more: choices = [ diff --git a/oellm/main.py b/oellm/main.py index 72679e8..e04d87d 100644 --- a/oellm/main.py +++ b/oellm/main.py @@ -2,504 +2,32 @@ import os import re import shutil -import socket import subprocess from datetime import datetime -from itertools import product +from importlib.resources import files from pathlib import Path from string import Template -from typing import Iterable import numpy as np import pandas as pd -import yaml from jsonargparse import auto_cli -from rich import box -from rich.console import Console -from rich.logging import RichHandler - - -def _ensure_singularity_image(image_name: str) -> None: - # TODO: switch to OELLM dataset repo once it is created - from huggingface_hub import hf_hub_download - - hf_repo = os.environ.get("HF_SIF_REPO", "timurcarstensen/testing") - image_path = Path(os.getenv("EVAL_BASE_DIR")) / image_name - - try: - hf_hub_download( - repo_id=hf_repo, - filename=image_name, - repo_type="dataset", - local_dir=os.getenv("EVAL_BASE_DIR"), - ) - logging.info( - "Successfully downloaded latest Singularity image from HuggingFace" - ) - except Exception as e: - logging.warning( - "Failed to fetch latest container image from HuggingFace: %s", str(e) - ) - if image_path.exists(): - logging.info("Using existing Singularity image at %s", image_path) - else: - raise RuntimeError( - f"No container image found at {image_path} and failed to download from HuggingFace. " - f"Cannot proceed with evaluation scheduling." 
- ) from e - - logging.info( - "Singularity image ready at %s", - Path(os.getenv("EVAL_BASE_DIR")) / os.getenv("EVAL_CONTAINER_IMAGE"), - ) - - -def _setup_logging(verbose: bool = False): - rich_handler = RichHandler( - console=Console(), - show_time=True, - log_time_format="%H:%M:%S", - show_path=False, - markup=True, - rich_tracebacks=True, - ) - - class RichFormatter(logging.Formatter): - def format(self, record): - # Define colors for different log levels - record.msg = f"{record.getMessage()}" - return record.msg - - rich_handler.setFormatter(RichFormatter()) - - root_logger = logging.getLogger() - root_logger.handlers = [] # Remove any default handlers - root_logger.addHandler(rich_handler) - root_logger.setLevel(logging.DEBUG if verbose else logging.INFO) - - -def _load_cluster_env() -> None: - """ - Loads the correct cluster environment variables from `clusters.yaml` based on the hostname. - """ - with open(Path(__file__).parent / "clusters.yaml") as f: - clusters = yaml.safe_load(f) - hostname = socket.gethostname() - - # First load shared environment variables - shared_cfg = clusters.get("shared", {}) - - # match hostname to the regex in the clusters.yaml - for host in set(clusters.keys()) - {"shared"}: - pattern = clusters[host]["hostname_pattern"] - # Convert shell-style wildcards to regex - regex_pattern = pattern.replace(".", r"\.").replace("*", ".*") - if re.match(f"^{regex_pattern}$", hostname): - cluster_cfg = clusters[host] - break - else: - raise ValueError(f"No cluster found for hostname: {hostname}") - - # Combine shared and cluster-specific configs, with cluster-specific taking precedence - # Remove hostname_pattern from the final config - if "hostname_pattern" in cluster_cfg: - del cluster_cfg["hostname_pattern"] - - # Set environment variables, expanding any template variables - for k, v in cluster_cfg.items(): - # Expand template variables using existing environment variables - os.environ[k] = str(v) - - for k, v in shared_cfg.items(): - try: - os.environ[k] = str(v).format(**cluster_cfg) - except KeyError as e: - # when substituting env vars that are not in cluster_cfg but in the environment (e.g., $USER, $SHELL, etc...) - if len(e.args) > 1: - raise ValueError( - f"Env. variable substitution for {k} failed. Missing keys: {', '.join(e.args)}" - ) from e - - missing_key: str = e.args[0] - os.environ[k] = str(v).format( - **cluster_cfg, **{missing_key: os.environ[missing_key]} - ) - - -def _num_jobs_in_queue() -> int: - # TODO avoid running in shell mode which is not secure - result = subprocess.run( - "squeue -u $USER -h -t pending,running -r | wc -l", - shell=True, - capture_output=True, - text=True, - ) - - if result.stdout: - try: - return int(result.stdout.strip()) - except ValueError: - logging.warning(f"Could not parse squeue output: {result.stdout}") - return 0 - - if result.stderr: - logging.warning(f"squeue command produced an error: {result.stderr.strip()}") - - return 0 - - -def _expand_local_model_paths(model: str) -> list[Path]: - """ - Expands a local model path to include all checkpoints if it's a directory. - Recursively searches for models in subdirectories. 
- - Args: - model: Path to a model or directory containing models - - Returns: - List of paths to model directories containing safetensors files - """ - model_paths = [] - model_path = Path(model) - - if not model_path.exists() or not model_path.is_dir(): - return model_paths - - # First check if current directory contains safetensors files - if any(model_path.glob("*.safetensors")): - model_paths.append(model_path) - # If current dir has safetensors, don't recurse further - return model_paths - - # Check for hf subdirectory pattern (single model with checkpoints) - hf_path = model_path / "hf" - if hf_path.exists() and hf_path.is_dir(): - # This is a single model with checkpoints in hf/iter_* structure - for subdir in hf_path.glob("*"): - if subdir.is_dir() and any(subdir.glob("*.safetensors")): - model_paths.append(subdir) - if model_paths: - return model_paths - - # Check if subdirectories look like model directories - # (e.g., open-sci-ref_model-0.13b_data-c4_...) - subdirs = [d for d in model_path.iterdir() if d.is_dir()] - - # Process each subdirectory as a potential model - for subdir in subdirs: - # Check if this subdirectory directly contains safetensors - if any(subdir.glob("*.safetensors")): - model_paths.append(subdir) - else: - # Check for hf/iter_* pattern in this subdirectory - hf_subpath = subdir / "hf" - if hf_subpath.exists() and hf_subpath.is_dir(): - for checkpoint_dir in hf_subpath.glob("*"): - if checkpoint_dir.is_dir() and any( - checkpoint_dir.glob("*.safetensors") - ): - model_paths.append(checkpoint_dir) - - if len(model_paths) > 1: - logging.info(f"Expanded '{model}' to {len(model_paths)} model checkpoints") - - return model_paths - - -def _process_model_paths(models: Iterable[str]) -> dict[str, list[Path | str]]: - """ - Processes model strings into a dict of model paths. - - Each model string can be a local path or a huggingface model identifier. - This function expands directory paths that contain multiple checkpoints. - """ - from huggingface_hub import snapshot_download - - processed_model_paths = {} - model_paths = [] - for model in models: - # First try to expand local paths - local_paths = _expand_local_model_paths(model) - if local_paths: - model_paths.extend(local_paths) - else: - logging.info( - f"Model {model} not found locally, assuming it is a πŸ€— hub model" - ) - logging.debug( - f"Downloading model {model} on the login node since the compute nodes may not have access to the internet" - ) - - if "," in model: - model_kwargs = dict( - [kv.split("=") for kv in model.split(",") if "=" in kv] - ) - - # The first element before the comma is the repository ID on the πŸ€— Hub - repo_id = model.split(",")[0] - - # snapshot_download kwargs - snapshot_kwargs = {} - if "revision" in model_kwargs: - snapshot_kwargs["revision"] = model_kwargs["revision"] - - try: - # Pre-download (or reuse cache) for the whole repository so that - # compute nodes can load it offline. - snapshot_download( - repo_id=repo_id, - cache_dir=Path(os.getenv("HF_HOME")) / "hub", - **snapshot_kwargs, - ) - model_paths.append(model) - except Exception as e: - logging.debug( - f"Failed to download model {model} from Hugging Face Hub. Continuing..." - ) - logging.debug(e) - else: - # Download the entire model repository to the local cache. The - # original identifier is kept in *model_paths* so downstream - # code can still reference it; at runtime the files will be - # read from cache, allowing offline execution. 
- snapshot_download( - repo_id=model, - cache_dir=Path(os.getenv("HF_HOME")) / "hub", - ) - model_paths.append(model) - - if not model_paths: - logging.warning( - f"Could not find any valid model for '{model}'. It will be skipped." - ) - processed_model_paths[model] = model_paths - return processed_model_paths - - -def _count_task_subtasks(task_name: str, task_manager) -> int: - from lm_eval.evaluator_utils import get_subtask_list # type: ignore - - task_objects = task_manager.load_task_or_group(task_name) - subtask_dict = get_subtask_list(task_objects) - - total_subtasks = 0 - for _, subtask_list in subtask_dict.items(): - total_subtasks += len(subtask_list) - - return max(1, total_subtasks) # At least 1 subtask - - -def _calculate_task_minutes( - task_name: str, task_manager, base_minutes_per_subtask: int = 5 -) -> int: - """Calculate estimated minutes for a task based on its subtask count.""" - subtask_count = _count_task_subtasks(task_name, task_manager) - - # Special handling for known multi-language tasks that take longer per subtask - known_complex_tasks = { - "belebele": 8, # Multi-language reading comprehension, slower per subtask - "flores": 6, # Translation task, moderately complex - "xnli": 6, # Cross-lingual NLI - "xcopa": 6, # Cross-lingual COPA - "xstory_cloze": 6, # Cross-lingual story cloze - "paws-x": 6, # Cross-lingual paraphrase detection - "hellaswag": 20, # Hellaswag task, needs 20 minutes per subtask - } - - # Use task-specific timing if available, otherwise use default - minutes_per_subtask = known_complex_tasks.get( - task_name.lower(), base_minutes_per_subtask - ) - - # Calculate total time: (subtasks Γ— time_per_subtask) + base_overhead - base_overhead = 3 # Base overhead for task setup/teardown - total_minutes = max(10, (subtask_count * minutes_per_subtask) + base_overhead) - - # Log for complex tasks (>5 subtasks) or any known complex task - if subtask_count > 5 or task_name.lower() in known_complex_tasks: - complexity_note = ( - f" (known complex task, {minutes_per_subtask} min/subtask)" - if task_name.lower() in known_complex_tasks - else "" - ) - logging.info( - f"πŸ“Š Task '{task_name}' has {subtask_count} subtasks{complexity_note}, " - f"estimated time: {total_minutes} minutes ({total_minutes / 60:.1f} hours)" - ) - - return total_minutes - - -def _pre_download_task_datasets( - tasks: Iterable[str], trust_remote_code: bool = True -) -> None: - """Ensure that all datasets required by the given `tasks` are present in the local πŸ€— cache at $HF_HOME.""" - - from datasets import DownloadMode # type: ignore - from lm_eval.tasks import TaskManager # type: ignore - - processed: set[str] = set() - - tm = TaskManager() - - for task_name in tasks: - if not isinstance(task_name, str) or task_name in processed: - continue - processed.add(task_name) - - logging.info( - f"Preparing dataset for task '{task_name}' (download if not cached)…" - ) - - # Instantiating the task downloads the dataset (or reuses cache) - - task_config = { - "task": task_name, - "dataset_kwargs": {"trust_remote_code": trust_remote_code}, - } - - task_objects = tm.load_config(task_config) - - # Some entries might be nested dictionaries (e.g., groups) - stack = [task_objects] - while stack: - current = stack.pop() - if isinstance(current, dict): - stack.extend(current.values()) - continue - if hasattr(current, "download") and callable(current.download): - try: - current.download(download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS) # type: ignore[arg-type] - except TypeError as e: - logging.error( - 
f"Failed to download dataset for task '{task_name}' with download_mode=REUSE_DATASET_IF_EXISTS: {e}" - ) - current.download() # type: ignore[misc] - - logging.debug(f"Finished dataset preparation for task '{task_name}'.") - - -def _pre_download_lighteval_datasets(tasks: Iterable[str]) -> None: - """Pre-download LightEval datasets by instantiating tasks via the local LightEval Registry.""" - import sys - - local_le_src = Path(__file__).parent.parent / "lighteval" / "src" - if local_le_src.exists(): - sys.path.insert(0, str(local_le_src)) - - from lighteval.tasks.registry import Registry, TRUNCATE_FEW_SHOTS_DEFAULTS # type: ignore - from lighteval.tasks.lighteval_task import LightevalTask # type: ignore - - file_task_specs: list[str] = [] - string_task_specs: list[str] = [] - - for t in tasks: - raw = str(t).strip() - if not raw: - continue - candidate = Path(raw) - if candidate.exists() and candidate.is_file(): - file_task_specs.append(str(candidate)) - else: - spec = raw - truncate_default = int(TRUNCATE_FEW_SHOTS_DEFAULTS) - if "|" not in spec: - spec = f"lighteval|{spec}|0|{truncate_default}" - elif spec.count("|") == 1: - spec = f"{spec}|0|{truncate_default}" - elif spec.count("|") == 2: - spec = f"{spec}|{truncate_default}" - string_task_specs.append(spec) - - unique_string_specs = sorted(set(string_task_specs)) - unique_file_specs = sorted(set(file_task_specs)) - - if unique_string_specs: - reg = Registry(custom_tasks="lighteval.tasks.multilingual.tasks") - configs = reg.get_tasks_configs(",".join(unique_string_specs)) - task_dict = reg.get_tasks_from_configs(configs) - LightevalTask.load_datasets(task_dict) - - for fp in unique_file_specs: - reg_file = Registry() - configs_file = reg_file.get_tasks_configs(fp) - task_dict_file = reg_file.get_tasks_from_configs(configs_file) - LightevalTask.load_datasets(task_dict_file) - -def _load_task_groups() -> dict[str, dict]: - """Load task groups from `task-groups.yaml` located next to this module.""" - groups_file = Path(__file__).parent / "task-groups.yaml" - if not groups_file.exists(): - raise ValueError(f"Task groups file not found: {groups_file}") - - with open(groups_file) as f: - data = yaml.safe_load(f) or {} - - groups = data.get("task_groups") or {} - if not isinstance(groups, dict): - raise ValueError("Invalid task groups format in task-groups.yaml") - - return groups - - -def _expand_task_groups(group_names: Iterable[str]) -> list[tuple[str, list[int], str]]: - """ - Expand task group names into concrete (task, n_shots, suite) tuples. - - Supports nested groups. Defaults: suite=lm_eval, n_shots=[0] when absent. - A group's `suite` (if present) is inherited by its items and nested groups - unless a leaf explicitly overrides it. 
- """ - groups = _load_task_groups() - resolved: list[tuple[str, list[int], str]] = [] - - def expand_group(group_name: str, stack: set[str], inherited_suite: str | None = None) -> None: - if group_name not in groups: - raise ValueError(f"Unknown task group: {group_name}") - if group_name in stack: - raise ValueError(f"Cyclic task group reference detected at '{group_name}'") - - stack.add(group_name) - group_default_suite = groups[group_name].get("suite") - effective_inherited_suite = inherited_suite if inherited_suite is not None else group_default_suite - - for item in groups[group_name].get("tasks", []): - task_identifier = str(item.get("task")) - # Prefer explicit suite on the item; otherwise inherit; otherwise default to lm_eval - item_suite = item.get("suite") - suite_name = ( - str(item_suite) - if item_suite is not None - else (str(effective_inherited_suite) if effective_inherited_suite is not None else "lm_eval") - ) - n_shots_value = item.get("n_shots") - - # Nested group reference: propagate the resolved suite - if task_identifier in groups: - next_inherited = str(item_suite) if item_suite is not None else effective_inherited_suite - # Pass down only an inherited suite (or explicit item override) without defaulting to "lm_eval", - # so that the child group's own default `suite` can take effect if present. - expand_group(task_identifier, stack, next_inherited) - continue - - # Leaf task - if not isinstance(n_shots_value, list): - n_shots: list[int] = [0] - else: - # Ensure ints - n_shots = [int(x) for x in n_shots_value] - - resolved.append((task_identifier, n_shots, suite_name)) - stack.remove(group_name) - - for raw_name in group_names: - name = str(raw_name).strip() - if not name: - continue - expand_group(name, set(), None) - - return resolved +from oellm.task_cache import clear_task_cache +from oellm.task_groups import _expand_task_groups +from oellm.utils import ( + _ensure_singularity_image, + _expand_local_model_paths, + _load_cluster_env, + _num_jobs_in_queue, + _pre_download_lighteval_datasets, + _pre_download_task_datasets, + _process_model_paths, + _setup_logging, + capture_third_party_output_from_kwarg, +) + + +@capture_third_party_output_from_kwarg("verbose") def schedule_evals( models: str | None = None, tasks: str | None = None, @@ -557,7 +85,6 @@ def schedule_evals( else: logging.info("Skipping container image check (--skip-checks enabled)") - if eval_csv_path: if models or tasks or task_groups or n_shot: raise ValueError( @@ -610,6 +137,7 @@ def schedule_evals( logging.info( "Skipping model path processing and validation (--skip-checks enabled)" ) + elif models and ((tasks and n_shot is not None) or task_groups): model_list = [m.strip() for m in models.split(",") if m.strip()] model_paths: list[Path | str] = [] @@ -677,7 +205,9 @@ def schedule_evals( } ) - df = pd.DataFrame(rows, columns=["model_path", "task_path", "n_shot", "eval_suite"]) + df = pd.DataFrame( + rows, columns=["model_path", "task_path", "n_shot", "eval_suite"] + ) else: raise ValueError( "Provide `eval_csv_path`, or `models` with (`tasks` and `n_shot`) and/or `task_groups`." 
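For reference, `schedule_evals` is now wrapped by `capture_third_party_output_from_kwarg("verbose")` from the new `oellm/utils.py`, whose body does not appear in this hunk. A minimal sketch of what such a decorator might do, assuming it only silences third-party stdout/stderr unless the named keyword argument is truthy; the names and behaviour below are illustrative, not the shipped implementation:

import contextlib
import functools
import io


def capture_third_party_output_from_kwarg(kwarg_name: str):
    # Sketch only: suppress noisy library output unless e.g. verbose=True is passed.
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if kwargs.get(kwarg_name):
                # Verbose call: let stdout/stderr through untouched.
                return func(*args, **kwargs)
            buffer = io.StringIO()
            # Quiet call: capture everything third-party code prints.
            with contextlib.redirect_stdout(buffer), contextlib.redirect_stderr(buffer):
                return func(*args, **kwargs)

        return wrapper

    return decorator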
@@ -741,71 +271,20 @@ def schedule_evals( logging.debug(f"Saved evaluation dataframe to temporary CSV: {csv_path}") - with open(Path(__file__).parent / "template.sbatch") as f: - sbatch_template = f.read() + sbatch_template = (files("oellm.resources") / "template.sbatch").read_text() # Calculate dynamic array size and time limits total_evals = len(df) - # Calculate time based on actual task complexity (subtask count) - if not skip_checks: - from lm_eval.tasks import TaskManager # type: ignore - - shared_task_manager = TaskManager() - - # Calculate total minutes by considering each unique task's complexity - total_minutes = 0 - task_time_cache = {} # Cache to avoid recalculating for same tasks - - lm_eval_mask = df["eval_suite"].str.lower().isin( - {"lm_eval", "lm-eval", "lm-eval-harness"} - ) - light_eval_mask = df["eval_suite"].str.lower().isin({"lighteval", "light-eval"}) - - for _, row in df[lm_eval_mask].iterrows(): - task_name = row["task_path"] - if task_name not in task_time_cache: - task_time_cache[task_name] = _calculate_task_minutes( - task_name, task_manager=shared_task_manager - ) - total_minutes += task_time_cache[task_name] - - if light_eval_mask.any(): - # LightEval benchmarks can be large; budget 15 minutes per evaluation - light_eval_minutes = int(light_eval_mask.sum() * 15) - total_minutes += light_eval_minutes - logging.info( - "Estimated LightEval time budget: %s minutes across %s evaluations", - light_eval_minutes, - light_eval_mask.sum(), - ) - - # Calculate average minutes per eval for logging purposes - minutes_per_eval = total_minutes / total_evals if total_evals > 0 else 10 - - logging.info("πŸ“Š Dynamic time calculation:") - for task_name, task_minutes in task_time_cache.items(): - task_count = ( - (df["task_path"] == task_name) - & df["eval_suite"].str.lower().isin( - {"lm_eval", "lm-eval", "lm-eval-harness"} - ) - ).sum() - logging.info( - f" Task '{task_name}': {task_minutes} min/eval Γ— {task_count} evals = {task_minutes * task_count} total minutes" - ) - else: - # Fallback to fixed timing when checks are skipped - minutes_per_eval = 10 # Budget 10 minutes per eval - total_minutes = total_evals * minutes_per_eval - logging.info( - "⚠️ Using fixed 10 min/eval (task complexity detection skipped with --skip-checks)" - ) + # fixed timing estimation + minutes_per_eval = 10 # Budget 10 minutes per eval + total_minutes = total_evals * minutes_per_eval # Copy LightEval benchmark files into evaluation directory if necessary - light_eval_paths = df[ - df["eval_suite"].str.lower().isin({"lighteval", "light-eval"}) - ]["task_path"].unique() + # TODO: why do we need this? 
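    # A plausible reason (assumption, not confirmed by this patch): for the
    # LightEval suite the task_path column can hold a benchmark file rather than
    # a task name (cf. _pre_download_lighteval_datasets), so any referenced files
    # are presumably staged under evals_dir/light_eval_tasks and copied_paths
    # maps each original path to the staged copy used in the jobs CSV.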
+ light_eval_paths = df[df["eval_suite"].str.lower().isin({"lighteval", "light-eval"})][ + "task_path" + ].unique() benchmark_dir = evals_dir / "light_eval_tasks" copied_paths: dict[str, str] = {} if light_eval_paths.size > 0: @@ -1063,7 +542,6 @@ def collect_results( f"No performance metric found for {model_name} | {task_name} | n_shot={n_shot} in {json_file.name}" ) - if not rows and not check: logging.warning("No results extracted from JSON files") return @@ -1151,6 +629,7 @@ def main(): "schedule-eval": schedule_evals, "build-csv": build_csv, "collect-results": collect_results, + "clean-cache": lambda: clear_task_cache(), }, as_positional=False, description="OELLM: Multi-cluster evaluation tool for language models", diff --git a/oellm/resources/__init__.py b/oellm/resources/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/oellm/clusters.yaml b/oellm/resources/clusters.yaml similarity index 95% rename from oellm/clusters.yaml rename to oellm/resources/clusters.yaml index 5e14325..d6da6d2 100644 --- a/oellm/clusters.yaml +++ b/oellm/resources/clusters.yaml @@ -5,6 +5,7 @@ shared: HF_HOME: "{EVAL_BASE_DIR}/hf_data" # where HuggingFace models and datasets are stored EVAL_OUTPUT_DIR: "{EVAL_BASE_DIR}/{USER}" # where evaluations are written GPUS_PER_NODE: 1 + HF_HUB_DISABLE_PROGRESS_BARS: "1" leonardo: hostname_pattern: "*.leonardo.local" # use this regexp to automatically assign environment variables corresponding to this YAML @@ -31,4 +32,4 @@ lumi: ACCOUNT: "project_462000963" QUEUE_LIMIT: 210 EVAL_CONTAINER_IMAGE: "eval_env-lumi.sif" - SINGULARITY_ARGS: "--rocm" \ No newline at end of file + SINGULARITY_ARGS: "--rocm" diff --git a/oellm/task-groups.yaml b/oellm/resources/task-groups.yaml similarity index 96% rename from oellm/task-groups.yaml rename to oellm/resources/task-groups.yaml index e69de26..f9b7684 100644 --- a/oellm/task-groups.yaml +++ b/oellm/resources/task-groups.yaml @@ -75,7 +75,7 @@ task_groups: - task: flores200:lit_Latn-eng_Latn - task: flores200:lvs_Latn-eng_Latn - task: flores200:mlt_Latn-eng_Latn - - task: flores200:nld_Latn-eng_Latn + - task: flores200:nld_Latn-eng_Latn - task: flores200:pol_Latn-eng_Latn - task: flores200:por_Latn-eng_Latn - task: flores200:ron_Latn-eng_Latn @@ -103,7 +103,7 @@ task_groups: - task: flores200:eng_Latn-lit_Latn - task: flores200:eng_Latn-lvs_Latn - task: flores200:eng_Latn-mlt_Latn - - task: flores200:eng_Latn-nld_Latn + - task: flores200:eng_Latn-nld_Latn - task: flores200:eng_Latn-pol_Latn - task: flores200:eng_Latn-por_Latn - task: flores200:eng_Latn-ron_Latn @@ -140,6 +140,6 @@ super_groups: description: "Combined Belebele EU set plus multilingual benchmarks" task_groups: - task: flores-200-eu-to-eng - - task: flores-200-eng-to-eu + # - task: flores-200-eng-to-eu - task: belebele-eu-5-shot - - task: global-mmlu-eu \ No newline at end of file + # - task: global-mmlu-eu diff --git a/oellm/template.sbatch b/oellm/resources/template.sbatch similarity index 99% rename from oellm/template.sbatch rename to oellm/resources/template.sbatch index f4b4905..b68d637 100644 --- a/oellm/template.sbatch +++ b/oellm/resources/template.sbatch @@ -147,4 +147,4 @@ do done -echo "Job $SLURM_ARRAY_TASK_ID finished." \ No newline at end of file +echo "Job $SLURM_ARRAY_TASK_ID finished." 
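Earlier in this series the sbatch template's bash parameter expansions were rewritten with doubled braces (e.g. ${{eval_suite:-lm_eval}}, ${{LIGHT_TASK##*|}}). A minimal sketch of why that escaping is needed, assuming the template text is filled in with Python's str.format-style substitution; the helper name below is hypothetical:

def _render_sbatch_sketch(template_text: str, **slots: str) -> str:
    # With str.format, "{placeholder}" fields are substituted, while literal
    # braces must be doubled, so "${{VAR:-default}}" comes out as the bash
    # expansion "${VAR:-default}" in the rendered script.
    return template_text.format(**slots)


# Illustrative usage:
# _render_sbatch_sketch('eval_suite=$(echo "${{eval_suite:-lm_eval}}") # csv={csv}', csv="jobs.csv")
# -> 'eval_suite=$(echo "${eval_suite:-lm_eval}") # csv=jobs.csv'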
diff --git a/oellm/task_cache.py b/oellm/task_cache.py index e457e77..d8be806 100644 --- a/oellm/task_cache.py +++ b/oellm/task_cache.py @@ -1,4 +1,6 @@ import json +import logging +from contextlib import contextmanager from datetime import datetime from pathlib import Path @@ -6,15 +8,15 @@ def get_task_cache_file() -> Path: - return Path(__file__).resolve().parent / "task_map_cache.json" + return Path(__file__).resolve().parent / "resources" / "task_map_cache.json" def load_task_cache() -> dict: cache_file = get_task_cache_file() - if not cache_file.exists(): - return {} - with open(cache_file, "r") as f: - return json.load(f) or {} + if cache_file.exists(): + with open(cache_file) as f: + return json.load(f) or {} + return {} def save_task_cache(cache: dict) -> None: @@ -23,6 +25,12 @@ def save_task_cache(cache: dict) -> None: json.dump(cache, f, indent=2, sort_keys=True) +def clear_task_cache() -> None: + cache_file = get_task_cache_file() + with open(cache_file, "w") as f: + json.dump({}, f) + + def task_cache_key(framework: str, task_id: str) -> str: return f"{framework}::{task_id}" @@ -47,5 +55,270 @@ def task_cache_lookup( def task_cache_mark_resolved(framework: str, task_id: str) -> None: cache = load_task_cache() key = task_cache_key(framework, task_id) - cache[key] = {"ts": datetime.now().timestamp()} + entry = cache.get(key) if isinstance(cache.get(key), dict) else {} + entry["ts"] = datetime.now().timestamp() + cache[key] = entry + save_task_cache(cache) + + +def task_cache_get_payload(framework: str, task_id: str) -> dict | None: + cache = load_task_cache() + key = task_cache_key(framework, task_id) + entry = cache.get(key) + if not isinstance(entry, dict): + return None + payload = entry.get("payload") + return payload if isinstance(payload, dict) else None + + +def task_cache_set_payload(framework: str, task_id: str, payload: dict) -> None: + cache = load_task_cache() + key = task_cache_key(framework, task_id) + entry: dict = cache.get(key) if isinstance(cache.get(key), dict) else {} # type: ignore[assignment] + entry["ts"] = datetime.now().timestamp() + entry["payload"] = payload + cache[key] = entry save_task_cache(cache) + + +def _canonical_key(call: dict) -> tuple: + t = call.get("type") + if t == "load_dataset": + return ( + t, + call.get("path"), + call.get("name"), + call.get("split"), + call.get("revision"), + ) + if t == "snapshot_download": + return ( + t, + call.get("repo_id"), + call.get("repo_type"), + call.get("revision"), + ) + if t == "hf_hub_download": + return ( + t, + call.get("repo_id"), + call.get("filename"), + call.get("repo_type"), + call.get("revision"), + ) + return (str(t),) + + +def dedupe_calls(calls: list[dict]) -> list[dict]: + if not isinstance(calls, list): + return [] + best: dict[tuple, dict] = {} + for c in calls: + if not isinstance(c, dict): + continue + key = _canonical_key(c) + existing = best.get(key) + if existing is None: + best[key] = c + continue + # Prefer trust_remote_code=True for load_dataset + if c.get("type") == "load_dataset": + if bool(c.get("trust_remote_code")) and not bool( + existing.get("trust_remote_code") + ): + best[key] = c + # Optionally drop snapshot_download if matching load_dataset exists + filtered: list[dict] = [] + load_keys = { + ("load_dataset", k[1], k[2], k[3], k[4]) + for k in best.keys() + if k and k[0] == "load_dataset" + } + for k, v in best.items(): + if k and k[0] == "snapshot_download": + # derive comparable key shape: (type, repo_id, None, None, revision) + comparable = ("load_dataset", k[1], 
None, None, k[3]) + if comparable in load_keys: + continue + filtered.append(v) + return filtered + + +@contextmanager +def capture_hf_dataset_calls(): + captured: list[dict] = [] + + import datasets as _ds # type: ignore + import huggingface_hub as _hfh # type: ignore + + _orig_load_dataset = _ds.load_dataset + _orig_snapshot_download = _hfh.snapshot_download + _orig_hf_hub_download = _hfh.hf_hub_download + + def _load_dataset_proxy(path, *args, **kwargs): # noqa: ANN001 + name = ( + kwargs.get("name") + if "name" in kwargs + else (args[0] if len(args) > 0 else None) + ) + data_files = ( + kwargs.get("data_files") + if "data_files" in kwargs + else (args[1] if len(args) > 1 else None) + ) + split = ( + kwargs.get("split") + if "split" in kwargs + else (args[2] if len(args) > 2 else None) + ) + trust_remote_code = kwargs.get("trust_remote_code") + revision = kwargs.get("revision") + captured.append( + { + "type": "load_dataset", + "path": path, + "name": name, + "data_files": data_files, + "split": split, + "revision": revision, + "trust_remote_code": trust_remote_code, + } + ) + return _orig_load_dataset(path, *args, **kwargs) + + def _snapshot_download_proxy(*args, **kwargs): # noqa: ANN001 + repo_id = ( + kwargs.get("repo_id") + if "repo_id" in kwargs + else (args[0] if len(args) > 0 else None) + ) + repo_type = ( + kwargs.get("repo_type") + if "repo_type" in kwargs + else (args[1] if len(args) > 1 else None) + ) + revision = ( + kwargs.get("revision") + if "revision" in kwargs + else (args[2] if len(args) > 2 else None) + ) + captured.append( + { + "type": "snapshot_download", + "repo_id": repo_id, + "repo_type": repo_type, + "revision": revision, + } + ) + return _orig_snapshot_download(*args, **kwargs) + + def _hf_hub_download_proxy(*args, **kwargs): # noqa: ANN001 + repo_id = ( + kwargs.get("repo_id") + if "repo_id" in kwargs + else (args[0] if len(args) > 0 else None) + ) + filename = ( + kwargs.get("filename") + if "filename" in kwargs + else (args[1] if len(args) > 1 else None) + ) + repo_type = ( + kwargs.get("repo_type") + if "repo_type" in kwargs + else (args[2] if len(args) > 2 else None) + ) + revision = ( + kwargs.get("revision") + if "revision" in kwargs + else (args[3] if len(args) > 3 else None) + ) + captured.append( + { + "type": "hf_hub_download", + "repo_id": repo_id, + "filename": filename, + "repo_type": repo_type, + "revision": revision, + } + ) + return _orig_hf_hub_download(*args, **kwargs) + + _ds.load_dataset = _load_dataset_proxy # type: ignore[assignment] + _hfh.snapshot_download = _snapshot_download_proxy # type: ignore[assignment] + _hfh.hf_hub_download = _hf_hub_download_proxy # type: ignore[assignment] + try: + yield captured + finally: + _ds.load_dataset = _orig_load_dataset # type: ignore[assignment] + _hfh.snapshot_download = _orig_snapshot_download # type: ignore[assignment] + _hfh.hf_hub_download = _orig_hf_hub_download # type: ignore[assignment] + + +def prewarm_from_payload(payload: dict | None, *, trust_remote_code: bool = True) -> None: + if not isinstance(payload, dict): + return + calls = payload.get("calls") + if not isinstance(calls, list): + return + + from datasets import load_dataset # type: ignore + from huggingface_hub import hf_hub_download, snapshot_download # type: ignore + + for call in calls: + if not isinstance(call, dict): + continue + # Unified prewarm log message + dataset_id = None + if call.get("type") == "load_dataset": + path = call.get("path") + name = call.get("name") + dataset_id = f"{path}{'::' + name if name else ''}" 
+ else: + repo_id = call.get("repo_id") + filename = call.get("filename") + dataset_id = ( + f"{repo_id}{'/' + filename if filename else ''}" + if isinstance(repo_id, str) + else None + ) + if dataset_id: + logging.info(f"Prewarming dataset cache: {dataset_id}") + if call.get("type") == "snapshot_download": + repo_id = call.get("repo_id") + if isinstance(repo_id, str) and repo_id: + snapshot_download( + repo_id=repo_id, + repo_type=call.get("repo_type") or "dataset", + revision=call.get("revision"), + ) + continue + if call.get("type") == "hf_hub_download": + repo_id = call.get("repo_id") + filename = call.get("filename") + if isinstance(repo_id, str) and isinstance(filename, str): + hf_hub_download( + repo_id=repo_id, + filename=filename, + repo_type=call.get("repo_type"), + revision=call.get("revision"), + ) + continue + path = call.get("path") + name = call.get("name") + data_files = call.get("data_files") + split = call.get("split") + revision = call.get("revision") + trc = call.get("trust_remote_code", trust_remote_code) + kwargs: dict = {} + if name is not None: + kwargs["name"] = name + if data_files is not None: + kwargs["data_files"] = data_files + if revision is not None: + kwargs["revision"] = revision + kwargs["trust_remote_code"] = bool(trc) + if split is not None: + load_dataset(path, split=split, **kwargs) + else: + load_dataset(path, **kwargs) diff --git a/oellm/task_groups.py b/oellm/task_groups.py new file mode 100644 index 0000000..1419de7 --- /dev/null +++ b/oellm/task_groups.py @@ -0,0 +1,131 @@ +from collections.abc import Iterable +from dataclasses import dataclass +from importlib.resources import files + +import yaml + + +@dataclass +class _Task: + name: str + n_shots: list[int] | None = None + + +@dataclass +class TaskGroup: + name: str + tasks: list[_Task] + suite: str + description: str + n_shots: list[int] | None = None + + def __post_init__(self): + for task in self.tasks: + if task.n_shots is None and self.n_shots is not None: + task.n_shots = self.n_shots + elif task.n_shots is None and self.n_shots is None: + raise ValueError( + f"N_shots is not set for task {task.name} and no default n_shots is set for the task group: {self.name}" + ) + + @classmethod + def from_dict(cls, name: str, data: dict) -> "TaskGroup": + tasks = [] + for task_data in data["tasks"]: + task_name = task_data["task"] + task_n_shots = task_data.get("n_shots") + tasks.append(_Task(name=task_name, n_shots=task_n_shots)) + + return cls( + name=name, + tasks=tasks, + suite=data["suite"], + description=data["description"], + n_shots=data.get("n_shots"), + ) + + +@dataclass +class TaskSuperGroup: + name: str + task_groups: list[TaskGroup] + description: str + + def __post_init__(self): + resolved_groups = [] + for group in self.task_groups: + if isinstance(group, str): + raise ValueError( + f"Task group '{group}' not found in available task groups" + ) + resolved_groups.append(group) + self.task_groups = resolved_groups + + @classmethod + def from_dict( + cls, name: str, data: dict, available_task_groups: dict[str, TaskGroup] + ) -> "TaskSuperGroup": + task_groups = [] + for task_group_data in data["task_groups"]: + group_name = task_group_data["task"] + if group_name not in available_task_groups: + raise ValueError( + f"Task group '{group_name}' not found in available task groups" + ) + task_groups.append(available_task_groups[group_name]) + + return cls( + name=name, + task_groups=task_groups, + description=data["description"], + ) + + +def _parse_task_groups( + requested_groups: list[str], 
+) -> dict[str, TaskSuperGroup | TaskGroup]: + data = ( + yaml.safe_load((files("oellm.resources") / "task-groups.yaml").read_text()) or {} + ) + + task_groups: dict[str, TaskGroup] = {} + + for task_group_name, task_data in data["task_groups"].items(): + task_groups[task_group_name] = TaskGroup.from_dict(task_group_name, task_data) + + super_groups: dict[str, TaskSuperGroup] = {} + for super_group_name, super_group_data in data.get("super_groups", {}).items(): + super_groups[super_group_name] = TaskSuperGroup.from_dict( + super_group_name, super_group_data, task_groups + ) + + result = {**task_groups, **super_groups} + return { + group_name: group + for group_name, group in result.items() + if group_name in requested_groups + } + + +def _expand_task_groups(group_names: Iterable[str]) -> list[tuple[str, list[int], str]]: + parsed = _parse_task_groups([str(n).strip() for n in group_names if str(n).strip()]) + missing = {str(n).strip() for n in group_names if str(n).strip()} - set(parsed.keys()) + if missing: + raise ValueError(f"Unknown task group(s): {', '.join(sorted(missing))}") + + results: list[tuple[str, list[int], str]] = [] + + for _, group in parsed.items(): + if isinstance(group, TaskGroup): + suite = group.suite + for t in group.tasks: + shots = [int(s) for s in (t.n_shots or [])] + results.append((t.name, shots, suite)) + else: + for g in group.task_groups: + suite = g.suite + for t in g.tasks: + shots = [int(s) for s in (t.n_shots or [])] + results.append((t.name, shots, suite)) + + return results diff --git a/oellm/utils.py b/oellm/utils.py new file mode 100644 index 0000000..63927a5 --- /dev/null +++ b/oellm/utils.py @@ -0,0 +1,480 @@ +import builtins +import fnmatch +import logging +import os +import socket +import subprocess +import sys +from collections.abc import Iterable +from contextlib import contextmanager +from functools import wraps +from importlib.resources import files +from pathlib import Path + +import yaml +from rich.console import Console +from rich.logging import RichHandler + +from oellm.task_cache import ( + capture_hf_dataset_calls, + dedupe_calls, + prewarm_from_payload, + task_cache_get_payload, + task_cache_lookup, + task_cache_mark_resolved, + task_cache_set_payload, +) + + +def _ensure_singularity_image(image_name: str) -> None: + from huggingface_hub import hf_hub_download + + image_path = Path(os.getenv("EVAL_BASE_DIR")) / image_name + + try: + hf_hub_download( + repo_id="openeurollm/evaluation_singularity_images", + filename=image_name, + repo_type="dataset", + local_dir=os.getenv("EVAL_BASE_DIR"), + ) + logging.info("Successfully downloaded latest Singularity image from HuggingFace") + except Exception as e: + logging.warning( + "Failed to fetch latest container image from HuggingFace: %s", str(e) + ) + if image_path.exists(): + logging.info("Using existing Singularity image at %s", image_path) + else: + raise RuntimeError( + f"No container image found at {image_path} and failed to download from HuggingFace. " + f"Cannot proceed with evaluation scheduling." 
+ ) from e + + logging.info( + "Singularity image ready at %s", + Path(os.getenv("EVAL_BASE_DIR")) / os.getenv("EVAL_CONTAINER_IMAGE"), + ) + + +def _setup_logging(verbose: bool = False): + rich_handler = RichHandler( + console=Console(), + show_time=True, + log_time_format="%H:%M:%S", + show_path=False, + markup=True, + rich_tracebacks=True, + ) + + class RichFormatter(logging.Formatter): + def format(self, record): + record.msg = f"{record.getMessage()}" + return record.msg + + rich_handler.setFormatter(RichFormatter()) + + root_logger = logging.getLogger() + root_logger.handlers = [] + root_logger.addHandler(rich_handler) + root_logger.setLevel(logging.DEBUG if verbose else logging.INFO) + + +def _load_cluster_env() -> None: + """ + Loads the correct cluster environment variables from `clusters.yaml` based on the hostname. + """ + clusters = yaml.safe_load((files("oellm.resources") / "clusters.yaml").read_text()) + hostname = socket.gethostname() + + shared_cfg = clusters.get("shared", {}) or {} + + cluster_cfg_raw: dict | None = None + for name, cfg in clusters.items(): + if name == "shared": + continue + pattern = cfg.get("hostname_pattern") + if isinstance(pattern, str) and fnmatch.fnmatch(hostname, pattern): + cluster_cfg_raw = dict(cfg) + break + if cluster_cfg_raw is None: + raise ValueError(f"No cluster found for hostname: {hostname}") + + cluster_cfg_raw.pop("hostname_pattern", None) + + class _Default(dict): + def __missing__(self, key): + return "{" + key + "}" + + base_ctx = _Default({**os.environ, **{k: str(v) for k, v in cluster_cfg_raw.items()}}) + + resolved_shared = {k: str(v).format_map(base_ctx) for k, v in shared_cfg.items()} + + ctx = _Default({**base_ctx, **resolved_shared}) + + resolved_cluster = {k: str(v).format_map(ctx) for k, v in cluster_cfg_raw.items()} + + final_env = {**resolved_shared, **resolved_cluster} + for k, v in final_env.items(): + os.environ[k] = v + + +def _num_jobs_in_queue() -> int: + user = os.environ.get("USER") + cmd: list[str] = ["squeue"] + if user: + cmd += ["-u", user] + cmd += ["-h", "-t", "pending,running", "-r", "-o", "%i"] + + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode != 0: + if result.stderr: + logging.warning(f"squeue error: {result.stderr.strip()}") + return 0 + + output = result.stdout.strip() + if not output: + return 0 + return sum(1 for line in output.splitlines() if line.strip()) + + +def _expand_local_model_paths(model: str) -> list[Path]: + """ + Expands a local model path to include all checkpoints if it's a directory. + Recursively searches for models in subdirectories. 
+ + Args: + model: Path to a model or directory containing models + + Returns: + List of paths to model directories containing safetensors files + """ + model_paths = [] + model_path = Path(model) + + if not model_path.exists() or not model_path.is_dir(): + return model_paths + + if any(model_path.glob("*.safetensors")): + model_paths.append(model_path) + return model_paths + + hf_path = model_path / "hf" + if hf_path.exists() and hf_path.is_dir(): + for subdir in hf_path.glob("*"): + if subdir.is_dir() and any(subdir.glob("*.safetensors")): + model_paths.append(subdir) + if model_paths: + return model_paths + + subdirs = [d for d in model_path.iterdir() if d.is_dir()] + + for subdir in subdirs: + if any(subdir.glob("*.safetensors")): + model_paths.append(subdir) + else: + hf_subpath = subdir / "hf" + if hf_subpath.exists() and hf_subpath.is_dir(): + for checkpoint_dir in hf_subpath.glob("*"): + if checkpoint_dir.is_dir() and any( + checkpoint_dir.glob("*.safetensors") + ): + model_paths.append(checkpoint_dir) + + if len(model_paths) > 1: + logging.info(f"Expanded '{model}' to {len(model_paths)} model checkpoints") + + return model_paths + + +def _process_model_paths(models: Iterable[str]) -> dict[str, list[Path | str]]: + """ + Processes model strings into a dict of model paths. + + Each model string can be a local path or a huggingface model identifier. + This function expands directory paths that contain multiple checkpoints. + """ + from huggingface_hub import snapshot_download + + processed_model_paths: dict[str, list[Path | str]] = {} + + for model in models: + per_model_paths: list[Path | str] = [] + + local_paths = _expand_local_model_paths(model) + if local_paths: + per_model_paths.extend(local_paths) + else: + logging.info( + f"Model {model} not found locally, assuming it is a πŸ€— hub model" + ) + logging.debug( + f"Downloading model {model} on the login node since the compute nodes may not have access to the internet" + ) + + if "," in model: + model_kwargs = dict( + [kv.split("=") for kv in model.split(",") if "=" in kv] + ) + + repo_id = model.split(",")[0] + + snapshot_kwargs = {} + if "revision" in model_kwargs: + snapshot_kwargs["revision"] = model_kwargs["revision"] + + try: + snapshot_download( + repo_id=repo_id, + cache_dir=Path(os.getenv("HF_HOME")) / "hub", + **snapshot_kwargs, + ) + per_model_paths.append(model) + except Exception as e: + logging.debug( + f"Failed to download model {model} from Hugging Face Hub. Continuing..." + ) + logging.debug(e) + else: + snapshot_download( + repo_id=model, + cache_dir=Path(os.getenv("HF_HOME")) / "hub", + ) + per_model_paths.append(model) + + if not per_model_paths: + logging.warning( + f"Could not find any valid model for '{model}'. It will be skipped." + ) + processed_model_paths[model] = per_model_paths + + return processed_model_paths + + +def _pre_download_task_datasets( + tasks: Iterable[str], trust_remote_code: bool = True +) -> None: + processed: set[str] = set() + + misses: list[str] = [] + for task_name in tasks: + if not isinstance(task_name, str) or task_name in processed: + continue + processed.add(task_name) + if task_cache_lookup("lm-eval", task_name): + logging.info( + f"Skipping dataset preparation for task '{task_name}' (cache hit within TTL)." 
+ ) + continue + misses.append(task_name) + + if not misses: + for task_name in processed: + if task_cache_lookup("lm-eval", task_name): + prewarm_from_payload( + task_cache_get_payload("lm-eval", task_name), + trust_remote_code=trust_remote_code, + ) + return + + from datasets import DownloadMode # type: ignore + from lm_eval.tasks import TaskManager # type: ignore + + tm = TaskManager() + + for task_name in misses: + logging.info( + f"Preparing dataset for task '{task_name}' (download if not cached)…" + ) + + task_config = { + "task": task_name, + "dataset_kwargs": {"trust_remote_code": trust_remote_code}, + } + + with capture_hf_dataset_calls() as captured_calls: + task_objects = tm.load_config(task_config) + + stack = [task_objects] + while stack: + current = stack.pop() + if isinstance(current, dict): + stack.extend(current.values()) + continue + if hasattr(current, "download") and callable(current.download): + try: + current.download( + download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS + ) # type: ignore[arg-type] + except TypeError as e: + logging.error( + f"Failed to download dataset for task '{task_name}' with download_mode=REUSE_DATASET_IF_EXISTS: {e}" + ) + current.download() # type: ignore[misc] + + if captured_calls: + payload = {"calls": dedupe_calls(captured_calls)} + task_cache_set_payload("lm-eval", task_name, payload) + task_cache_mark_resolved("lm-eval", task_name) + logging.debug(f"Finished dataset preparation for task '{task_name}'.") + + +def _pre_download_lighteval_datasets(tasks: Iterable[str]) -> None: + misses: list[str] = [] + processed: set[str] = set() + for t in tasks: + raw = str(t).strip() + if not raw or raw in processed: + continue + processed.add(raw) + if task_cache_lookup("lighteval", raw): + logging.info( + f"Skipping dataset preparation for LightEval task '{raw}' (cache hit within TTL)." 
+ ) + continue + misses.append(raw) + + if not misses: + for raw in processed: + if task_cache_lookup("lighteval", raw): + prewarm_from_payload( + task_cache_get_payload("lighteval", raw), + trust_remote_code=True, + ) + return + + from lighteval.tasks.lighteval_task import LightevalTask # type: ignore + from lighteval.tasks.registry import ( # type: ignore + TRUNCATE_FEW_SHOTS_DEFAULTS, + Registry, + ) + + for raw in misses: + candidate = Path(raw) + if candidate.exists() and candidate.is_file(): + with capture_hf_dataset_calls() as captured_calls: + reg_file = Registry() + configs_file = reg_file.get_tasks_configs(str(candidate)) + task_dict_file = reg_file.get_tasks_from_configs(configs_file) + LightevalTask.load_datasets(task_dict_file) + if captured_calls: + payload = {"calls": dedupe_calls(captured_calls)} + task_cache_set_payload("lighteval", raw, payload) + task_cache_mark_resolved("lighteval", raw) + continue + + # Build single-spec string and load in isolation + spec = raw + truncate_default = int(TRUNCATE_FEW_SHOTS_DEFAULTS) + if "|" not in spec: + spec = f"lighteval|{spec}|0|{truncate_default}" + elif spec.count("|") == 1: + spec = f"{spec}|0|{truncate_default}" + elif spec.count("|") == 2: + spec = f"{spec}|{truncate_default}" + + with capture_hf_dataset_calls() as captured_calls: + reg = Registry(custom_tasks="lighteval.tasks.multilingual.tasks") + configs = reg.get_tasks_configs(spec) + task_dict = reg.get_tasks_from_configs(configs) + LightevalTask.load_datasets(task_dict) + if captured_calls: + payload = {"calls": dedupe_calls(captured_calls)} + task_cache_set_payload("lighteval", raw, payload) + task_cache_mark_resolved("lighteval", raw) + + +@contextmanager +def capture_third_party_output(verbose: bool = False): + """ + Suppresses print/logging.info/logging.debug originating from non-project modules + unless verbose=True. + + A call is considered "third-party" if its immediate caller's file path is not + under the repository root (parent of the `oellm` package directory). 
+ """ + if verbose: + yield + return + + package_root = Path(__file__).resolve().parent + + def is_internal_stack(skip: int = 2, max_depth: int = 12) -> bool: + f = sys._getframe(skip) + depth = 0 + while f and depth < max_depth: + filename = f.f_code.co_filename if f.f_code else "" + if filename: + p = Path(filename).resolve() + if p.is_relative_to(package_root): + return True + f = f.f_back + depth += 1 + return False + + orig_print = builtins.print + orig_logger_info = logging.Logger.info + orig_logger_debug = logging.Logger.debug + orig_module_info = logging.info + orig_module_debug = logging.debug + + def filtered_print(*args, **kwargs): + if is_internal_stack(): + return orig_print(*args, **kwargs) + # third-party: drop + return None + + def filtered_logger_info(self, msg, *args, **kwargs): + if is_internal_stack(): + return orig_logger_info(self, msg, *args, **kwargs) + return None + + def filtered_logger_debug(self, msg, *args, **kwargs): + if is_internal_stack(): + return orig_logger_debug(self, msg, *args, **kwargs) + return None + + def filtered_module_info(msg, *args, **kwargs): + if is_internal_stack(): + return orig_module_info(msg, *args, **kwargs) + return None + + def filtered_module_debug(msg, *args, **kwargs): + if is_internal_stack(): + return orig_module_debug(msg, *args, **kwargs) + return None + + builtins.print = filtered_print + logging.Logger.info = filtered_logger_info # type: ignore[assignment] + logging.Logger.debug = filtered_logger_debug # type: ignore[assignment] + logging.info = filtered_module_info # type: ignore[assignment] + logging.debug = filtered_module_debug # type: ignore[assignment] + + try: + yield + finally: + builtins.print = orig_print + logging.Logger.info = orig_logger_info # type: ignore[assignment] + logging.Logger.debug = orig_logger_debug # type: ignore[assignment] + logging.info = orig_module_info # type: ignore[assignment] + logging.debug = orig_module_debug # type: ignore[assignment] + + +def capture_third_party_output_from_kwarg( + verbose_kwarg: str = "verbose", default: bool = False +): + """ + Decorator factory that wraps the function execution inside + capture_third_party_output(verbose=kwargs.get(verbose_kwarg, default)). + """ + + def _decorator(func): + @wraps(func) + def _wrapper(*args, **kwargs): + verbose_value = bool(kwargs.get(verbose_kwarg, default)) + with capture_third_party_output(verbose=verbose_value): + return func(*args, **kwargs) + + return _wrapper + + return _decorator diff --git a/pyproject.toml b/pyproject.toml index 2b7a64a..d699cba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,12 @@ dependencies = [ "questionary", ] +[project.optional-dependencies] +dev = [ + "pytest>=8.4.1", + "pre-commit", +] + [project.scripts] oellm = "oellm.main:main" @@ -28,7 +34,7 @@ build-backend = "uv_build" [tool.uv.build-backend] module-name = "oellm" module-root = "" -include = ["oellm/clusters.yaml", "oellm/task-groups.yaml"] +include = ["oellm/resources/*"] [tool.uv.sources] torch = [ @@ -72,8 +78,3 @@ quote-style = "double" indent-style = "space" skip-magic-trailing-comma = false line-ending = "auto" - -[dependency-groups] -dev = [ - "pytest>=8.4.1", -] From a97d92dde63dbab52d0e43f4209a3406ad92df2c Mon Sep 17 00:00:00 2001 From: "Timur M. 
Carstensen" <40788422+timurcarstensen@users.noreply.github.com> Date: Tue, 21 Oct 2025 21:33:19 +0200 Subject: [PATCH 15/39] Update README.md Co-authored-by: David Salinas --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cdc3f89..328c1aa 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ This will launch an interactive workflow where you can: - Configure n-shot settings - Preview and save your evaluation configuration -The resulting CSV now includes an additional `eval_suite` column that records which +The resulting CSV includes an additional `eval_suite` column that records which evaluation framework (e.g., `lm_eval` or `lighteval`) should be used for each task. From c9db766e60f09e548a1eb8530fc8325765a93caf Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Tue, 21 Oct 2025 22:35:40 +0300 Subject: [PATCH 16/39] misc --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index cdc3f89..e00b278 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ # OpenEuroLLM CLI (oellm) -A package for running OELLM CLI workflows across multiple HPC clusters using SLURM job arrays and Singularity containers. +A package for running OELLM CLI workflows across multiple HPC clusters using SLURM job arrays and Singularity containers. ## Currently supported workflows - Schedule evaluations on multiple models and tasks on all clusters βœ… `oellm schedule-eval ...` - Restart failed evaluations (e.g., due to node failures) βœ… `oellm collect-results ... --reschedule true` - Interactive eval job/csv builder βœ… `oellm build-csv` - Recursively resolve local paths: pass a directory containing models and their nested intermediate checkpoints, will eval all checkpoints - - Support default task groups (cf `oellm/task-groups.yaml`) + - Support default task groups (cf `oellm/resources/task-groups.yaml`) ## Planned workflows - Sync and download evaluation results from all clusters via a shared data layer @@ -36,7 +36,7 @@ This will automatically: - Generate a SLURM job array to evaluate all model-task combinations - Submit the jobs with appropriate cluster-specific resource allocations -In case you meet HuggingFace quotas issues, make sure you are logged in by setting your `HF_TOKEN` and that you are part of [OpenEuroLLM](https://huggingface.co/OpenEuroLLM) organization. +In case you meet HuggingFace quotas issues, make sure you are logged in by setting your `HF_TOKEN` and that you are part of [OpenEuroLLM](https://huggingface.co/OpenEuroLLM) organization. ## Interactive CSV Builder @@ -108,7 +108,7 @@ The `oellm` package orchestrates distributed LLM evaluations through the followi ### 1. 
**Cluster Auto-Detection** - Automatically detects the current HPC cluster based on hostname patterns -- Loads cluster-specific configurations from [`clusters.yaml`](oellm/clusters.yaml) including: +- Loads cluster-specific configurations from [`clusters.yaml`](oellm/resources/clusters.yaml) including: - SLURM partition and account settings - Shared storage paths for models, datasets, and results - GPU allocation and queue limits From 10b26ff0ebeae019f33f6423d02b4a352669082d Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Tue, 21 Oct 2025 22:37:11 +0300 Subject: [PATCH 17/39] temporarily adding AGENTS>md for development --- AGENTS.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 AGENTS.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..7e64337 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,5 @@ +Rules: +- no try...Except unless absolutely necessary +- no unnecessary comments +- don't worry about tests +- if you need to run stuff, assume there is a .venv at the root of the project. you can also just use uv From e8e3b38a96ab5f65dac3543fa52510562a2f03af Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Tue, 21 Oct 2025 23:43:28 +0200 Subject: [PATCH 18/39] fix: task caching for lighteval --- oellm/resources/clusters.yaml | 1 + oellm/resources/task-groups.yaml | 4 +- oellm/task_cache.py | 71 +++++++++++++++---------- oellm/utils.py | 91 +++++++++++++++----------------- 4 files changed, 89 insertions(+), 78 deletions(-) diff --git a/oellm/resources/clusters.yaml b/oellm/resources/clusters.yaml index d6da6d2..738c25a 100644 --- a/oellm/resources/clusters.yaml +++ b/oellm/resources/clusters.yaml @@ -6,6 +6,7 @@ shared: EVAL_OUTPUT_DIR: "{EVAL_BASE_DIR}/{USER}" # where evaluations are written GPUS_PER_NODE: 1 HF_HUB_DISABLE_PROGRESS_BARS: "1" + HF_DATASETS_DISABLE_PROGRESS_BARS: "1" leonardo: hostname_pattern: "*.leonardo.local" # use this regexp to automatically assign environment variables corresponding to this YAML diff --git a/oellm/resources/task-groups.yaml b/oellm/resources/task-groups.yaml index f9b7684..ee081f2 100644 --- a/oellm/resources/task-groups.yaml +++ b/oellm/resources/task-groups.yaml @@ -140,6 +140,6 @@ super_groups: description: "Combined Belebele EU set plus multilingual benchmarks" task_groups: - task: flores-200-eu-to-eng - # - task: flores-200-eng-to-eu + - task: flores-200-eng-to-eu - task: belebele-eu-5-shot - # - task: global-mmlu-eu + - task: global-mmlu-eu diff --git a/oellm/task_cache.py b/oellm/task_cache.py index d8be806..2fdeea0 100644 --- a/oellm/task_cache.py +++ b/oellm/task_cache.py @@ -1,12 +1,18 @@ import json import logging from contextlib import contextmanager +from contextvars import ContextVar from datetime import datetime from pathlib import Path TASK_CACHE_TTL_DAYS = 30 +_CURRENT_CAPTURE_BUFFER: ContextVar[list[dict] | None] = ContextVar( + "_CURRENT_CAPTURE_BUFFER", default=None +) + + def get_task_cache_file() -> Path: return Path(__file__).resolve().parent / "resources" / "task_map_cache.json" @@ -147,6 +153,7 @@ def dedupe_calls(calls: list[dict]) -> list[dict]: @contextmanager def capture_hf_dataset_calls(): captured: list[dict] = [] + _buffer_token = _CURRENT_CAPTURE_BUFFER.set(captured) import datasets as _ds # type: ignore import huggingface_hub as _hfh # type: ignore @@ -173,17 +180,19 @@ def _load_dataset_proxy(path, *args, **kwargs): # noqa: ANN001 ) trust_remote_code = kwargs.get("trust_remote_code") revision = kwargs.get("revision") - captured.append( - { - "type": "load_dataset", - "path": path, - 
"name": name, - "data_files": data_files, - "split": split, - "revision": revision, - "trust_remote_code": trust_remote_code, - } - ) + buf = _CURRENT_CAPTURE_BUFFER.get() + if isinstance(buf, list): + buf.append( + { + "type": "load_dataset", + "path": path, + "name": name, + "data_files": data_files, + "split": split, + "revision": revision, + "trust_remote_code": trust_remote_code, + } + ) return _orig_load_dataset(path, *args, **kwargs) def _snapshot_download_proxy(*args, **kwargs): # noqa: ANN001 @@ -202,14 +211,16 @@ def _snapshot_download_proxy(*args, **kwargs): # noqa: ANN001 if "revision" in kwargs else (args[2] if len(args) > 2 else None) ) - captured.append( - { - "type": "snapshot_download", - "repo_id": repo_id, - "repo_type": repo_type, - "revision": revision, - } - ) + buf = _CURRENT_CAPTURE_BUFFER.get() + if isinstance(buf, list): + buf.append( + { + "type": "snapshot_download", + "repo_id": repo_id, + "repo_type": repo_type, + "revision": revision, + } + ) return _orig_snapshot_download(*args, **kwargs) def _hf_hub_download_proxy(*args, **kwargs): # noqa: ANN001 @@ -233,26 +244,30 @@ def _hf_hub_download_proxy(*args, **kwargs): # noqa: ANN001 if "revision" in kwargs else (args[3] if len(args) > 3 else None) ) - captured.append( - { - "type": "hf_hub_download", - "repo_id": repo_id, - "filename": filename, - "repo_type": repo_type, - "revision": revision, - } - ) + buf = _CURRENT_CAPTURE_BUFFER.get() + if isinstance(buf, list): + buf.append( + { + "type": "hf_hub_download", + "repo_id": repo_id, + "filename": filename, + "repo_type": repo_type, + "revision": revision, + } + ) return _orig_hf_hub_download(*args, **kwargs) _ds.load_dataset = _load_dataset_proxy # type: ignore[assignment] _hfh.snapshot_download = _snapshot_download_proxy # type: ignore[assignment] _hfh.hf_hub_download = _hf_hub_download_proxy # type: ignore[assignment] + try: yield captured finally: _ds.load_dataset = _orig_load_dataset # type: ignore[assignment] _hfh.snapshot_download = _orig_snapshot_download # type: ignore[assignment] _hfh.hf_hub_download = _orig_hf_hub_download # type: ignore[assignment] + _CURRENT_CAPTURE_BUFFER.reset(_buffer_token) def prewarm_from_payload(payload: dict | None, *, trust_remote_code: bool = True) -> None: diff --git a/oellm/utils.py b/oellm/utils.py index 63927a5..dac7a49 100644 --- a/oellm/utils.py +++ b/oellm/utils.py @@ -320,68 +320,57 @@ def _pre_download_task_datasets( def _pre_download_lighteval_datasets(tasks: Iterable[str]) -> None: + seen: set[str] = set() misses: list[str] = [] - processed: set[str] = set() - for t in tasks: - raw = str(t).strip() - if not raw or raw in processed: + tasks = [str(task).strip() for task in tasks] + for task in tasks: + if not task or task in seen: continue - processed.add(raw) - if task_cache_lookup("lighteval", raw): + seen.add(task) + if task_cache_lookup("lighteval", task): logging.info( - f"Skipping dataset preparation for LightEval task '{raw}' (cache hit within TTL)." + f"Skipping dataset preparation for task '{task}' (cache hit within TTL)." 
) continue - misses.append(raw) + misses.append(task) if not misses: - for raw in processed: - if task_cache_lookup("lighteval", raw): + for task in seen: + if task_cache_lookup("lighteval", task): prewarm_from_payload( - task_cache_get_payload("lighteval", raw), + task_cache_get_payload("lighteval", task), trust_remote_code=True, ) return - from lighteval.tasks.lighteval_task import LightevalTask # type: ignore - from lighteval.tasks.registry import ( # type: ignore - TRUNCATE_FEW_SHOTS_DEFAULTS, - Registry, - ) + for task in misses: + with capture_hf_dataset_calls() as captured_calls: + from lighteval.tasks.lighteval_task import LightevalTask + from lighteval.tasks.registry import ( + TRUNCATE_FEW_SHOTS_DEFAULTS, + Registry, + ) - for raw in misses: - candidate = Path(raw) - if candidate.exists() and candidate.is_file(): - with capture_hf_dataset_calls() as captured_calls: - reg_file = Registry() - configs_file = reg_file.get_tasks_configs(str(candidate)) - task_dict_file = reg_file.get_tasks_from_configs(configs_file) - LightevalTask.load_datasets(task_dict_file) - if captured_calls: - payload = {"calls": dedupe_calls(captured_calls)} - task_cache_set_payload("lighteval", raw, payload) - task_cache_mark_resolved("lighteval", raw) - continue + reg = Registry(custom_tasks="lighteval.tasks.multilingual.tasks") + truncate_default = int(TRUNCATE_FEW_SHOTS_DEFAULTS) - # Build single-spec string and load in isolation - spec = raw - truncate_default = int(TRUNCATE_FEW_SHOTS_DEFAULTS) - if "|" not in spec: - spec = f"lighteval|{spec}|0|{truncate_default}" - elif spec.count("|") == 1: - spec = f"{spec}|0|{truncate_default}" - elif spec.count("|") == 2: - spec = f"{spec}|{truncate_default}" + spec = task + if "|" not in spec: + spec = f"lighteval|{spec}|0|{truncate_default}" + elif spec.count("|") == 1: + spec = f"{spec}|0|{truncate_default}" + elif spec.count("|") == 2: + spec = f"{spec}|{truncate_default}" - with capture_hf_dataset_calls() as captured_calls: - reg = Registry(custom_tasks="lighteval.tasks.multilingual.tasks") configs = reg.get_tasks_configs(spec) task_dict = reg.get_tasks_from_configs(configs) LightevalTask.load_datasets(task_dict) - if captured_calls: - payload = {"calls": dedupe_calls(captured_calls)} - task_cache_set_payload("lighteval", raw, payload) - task_cache_mark_resolved("lighteval", raw) + + payload = ( + {"calls": dedupe_calls(captured_calls)} if captured_calls else {"calls": []} + ) + task_cache_set_payload("lighteval", task, payload) + task_cache_mark_resolved("lighteval", task) @contextmanager @@ -399,15 +388,21 @@ def capture_third_party_output(verbose: bool = False): package_root = Path(__file__).resolve().parent - def is_internal_stack(skip: int = 2, max_depth: int = 12) -> bool: + def is_internal_stack(skip: int = 2, max_depth: int = 20) -> bool: f = sys._getframe(skip) depth = 0 while f and depth < max_depth: - filename = f.f_code.co_filename if f.f_code else "" + code = f.f_code + filename = code.co_filename if code else "" if filename: p = Path(filename).resolve() - if p.is_relative_to(package_root): - return True + name = code.co_name if code else "" + # Skip logging internals and our filtering wrappers to find the real caller + if "/logging/__init__.py" in filename or name.startswith("filtered_"): + f = f.f_back + depth += 1 + continue + return p.is_relative_to(package_root) f = f.f_back depth += 1 return False From d8c8ed513ccf4b7a6ba1e92000edd1207544139c Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 00:55:45 +0300 Subject: [PATCH 
19/39] fix --- .github/workflows/build-and-push-apptainer.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-and-push-apptainer.yml b/.github/workflows/build-and-push-apptainer.yml index 197816e..5d3db37 100644 --- a/.github/workflows/build-and-push-apptainer.yml +++ b/.github/workflows/build-and-push-apptainer.yml @@ -18,7 +18,7 @@ jobs: matrix: image: [jureca, leonardo, lumi] name: Build & Publish SIF Artifact (${{ matrix.image }}) - runs-on: + runs-on: - runs-on=${{github.run_id}}/family=i7ie permissions: contents: read @@ -26,7 +26,7 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v4 - + - name: Install Apptainer run: | sudo apt-get update @@ -37,7 +37,7 @@ jobs: - name: Build SIF from definition file run: | - apptainer --verbose build --mksquashfs-args="-comp zstd -Xcompression-level 1" --fakeroot eval_env-${{ matrix.image }}.sif apptainer/${{ matrix.image }}.def + apptainer --verbose build --mksquashfs-args="-comp zstd -Xcompression-level 6" --fakeroot eval_env-${{ matrix.image }}.sif apptainer/${{ matrix.image }}.def - name: Install Hugging Face Hub CLI run: pip install --upgrade "huggingface_hub" @@ -45,7 +45,7 @@ jobs: - name: Login to Hugging Face Hub env: HF_TOKEN: ${{ secrets.HF_TOKEN }} - run: hf auth login --token "$HF_TOKEN" + run: hf auth login --token "$HF_TOKEN" - name: Upload SIF to Hugging Face Hub env: From d37b5327074ad10a359d7ad95bd6982eed45576e Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 11:03:43 +0300 Subject: [PATCH 20/39] fix: compression algorithm --- .github/workflows/build-and-push-apptainer.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-and-push-apptainer.yml b/.github/workflows/build-and-push-apptainer.yml index 5d3db37..0d1c8dd 100644 --- a/.github/workflows/build-and-push-apptainer.yml +++ b/.github/workflows/build-and-push-apptainer.yml @@ -37,7 +37,7 @@ jobs: - name: Build SIF from definition file run: | - apptainer --verbose build --mksquashfs-args="-comp zstd -Xcompression-level 6" --fakeroot eval_env-${{ matrix.image }}.sif apptainer/${{ matrix.image }}.def + apptainer --verbose build --mksquashfs-args="-comp gzip -Xcompression-level 1" --fakeroot eval_env-${{ matrix.image }}.sif apptainer/${{ matrix.image }}.def - name: Install Hugging Face Hub CLI run: pip install --upgrade "huggingface_hub" From 79ace47c7db41a1b51d6f6cfa0a78a66c6a898e5 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 11:22:14 +0200 Subject: [PATCH 21/39] fix: updated apptainer definitions to include correct uv install --- apptainer/jureca.def | 26 ++++++++++++++------------ apptainer/leonardo.def | 25 +++++++++++++------------ 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/apptainer/jureca.def b/apptainer/jureca.def index 23cd237..fe190ba 100644 --- a/apptainer/jureca.def +++ b/apptainer/jureca.def @@ -2,27 +2,29 @@ Bootstrap: docker From: nvcr.io/nvidia/pytorch:25.06-py3 %labels - Author multi-cluster-eval - Description Apptainer image for JURECA cluster (converted from dockerfile) + Author oellm-cli + Description Apptainer image for JURECA JSC cluster %post - # 1. 
Install uv package manager - curl -LsSf https://astral.sh/uv/install.sh | sh - echo 'export PATH=$HOME/.local/bin:$PATH' >> /etc/profile + # Install uv into a global bin + curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR=/usr/local/bin sh - # Make uv visible for subsequent commands during build - export PATH=/root/.local/bin:$PATH + # Put uv-installed tool shims in a global bin too + export UV_TOOL_BIN_DIR=/usr/local/bin + uv --version - # 2. Install Python dependencies uv pip install --system --break-system-packages lm-eval \ "transformers<=4.53.0" "datasets<4.0.0" wandb sentencepiece tiktoken accelerate - # Install LightEval CLI in an isolated environment + # Optional: keep tool envs under /opt to avoid $HOME + export UV_TOOL_DIR=/opt/uv-tools uv tool install "lighteval[multilingual]" - + %environment - # Ensure uv is present inside the container runtime as well - export PATH=/root/.local/bin:$PATH + export PATH=/usr/local/bin:$PATH + export UV_TOOL_BIN_DIR=/usr/local/bin + export UV_TOOL_DIR=/opt/uv-tools + %runscript exec bash "$@" \ No newline at end of file diff --git a/apptainer/leonardo.def b/apptainer/leonardo.def index 27f0eca..79f69f9 100644 --- a/apptainer/leonardo.def +++ b/apptainer/leonardo.def @@ -2,27 +2,28 @@ Bootstrap: docker From: nvcr.io/nvidia/pytorch:25.06-py3 %labels - Author multi-cluster-eval - Description Apptainer image for Leonardo cluster (converted from dockerfile) + Author oellm-cli + Description Apptainer image for Leonardo cluster %post - # 1. Install uv package manager - curl -LsSf https://astral.sh/uv/install.sh | sh - echo 'export PATH=$HOME/.local/bin:$PATH' >> /etc/profile + # Install uv into a global bin + curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR=/usr/local/bin sh - # Make uv visible for subsequent commands during build - export PATH=/root/.local/bin:$PATH + # Put uv-installed tool shims in a global bin too + export UV_TOOL_BIN_DIR=/usr/local/bin + uv --version - # 2. Install Python dependencies uv pip install --system --break-system-packages lm-eval \ "transformers<=4.53.0" "datasets<4.0.0" wandb sentencepiece tiktoken accelerate - # Install LightEval CLI in an isolated environment + # Optional: keep tool envs under /opt to avoid $HOME + export UV_TOOL_DIR=/opt/uv-tools uv tool install "lighteval[multilingual]" - + %environment - # Ensure uv is present inside the container runtime as well - export PATH=/root/.local/bin:$PATH + export PATH=/usr/local/bin:$PATH + export UV_TOOL_BIN_DIR=/usr/local/bin + export UV_TOOL_DIR=/opt/uv-tools %runscript exec bash "$@" \ No newline at end of file From 13e985c51b3aec385b2acf6dee6872606daf5a45 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 11:35:30 +0200 Subject: [PATCH 22/39] fix: lighteval cli args --- oellm/resources/template.sbatch | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/oellm/resources/template.sbatch b/oellm/resources/template.sbatch index b68d637..d02a93c 100644 --- a/oellm/resources/template.sbatch +++ b/oellm/resources/template.sbatch @@ -135,8 +135,8 @@ do lighteval accelerate \ "model_name=$model_path,trust_remote_code=True" \ "$LIGHT_TASK_ARG" \ - --output_dir "$RESULTS_SUBDIR" \ - --save_details + --output-dir "$RESULTS_SUBDIR" \ + --save-details ;; *) echo "[warning] Unknown evaluation suite '$eval_suite'. Skipping." 
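For reference, the task-cache fix in PATCH 18 above routes every proxied Hugging Face call through a `ContextVar` so that each `capture_hf_dataset_calls()` invocation only records calls made while its own buffer is active, even if captures nest. A condensed, self-contained sketch of that pattern (names shortened for illustration; the real proxies in `oellm/task_cache.py` also wrap `snapshot_download` and `hf_hub_download`):

```python
# Hedged sketch of the ContextVar-backed capture buffer from PATCH 18.
from contextlib import contextmanager
from contextvars import ContextVar

_BUFFER: ContextVar[list[dict] | None] = ContextVar("_BUFFER", default=None)

def _record(call: dict) -> None:
    # Proxies append to whichever buffer is active in the current context.
    buf = _BUFFER.get()
    if isinstance(buf, list):
        buf.append(call)

@contextmanager
def capture():
    captured: list[dict] = []
    token = _BUFFER.set(captured)
    try:
        yield captured
    finally:
        _BUFFER.reset(token)

with capture() as calls:
    _record({"type": "load_dataset", "path": "demo"})
assert calls == [{"type": "load_dataset", "path": "demo"}]
```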
From c9160d5eb5700fa9034609941f634f8bbbdeb04e Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 12:53:24 +0200 Subject: [PATCH 23/39] feat: wrapper to suppress tqdm output --- oellm/resources/clusters.yaml | 4 +- oellm/utils.py | 73 +++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 2 deletions(-) diff --git a/oellm/resources/clusters.yaml b/oellm/resources/clusters.yaml index 738c25a..a963dad 100644 --- a/oellm/resources/clusters.yaml +++ b/oellm/resources/clusters.yaml @@ -5,8 +5,8 @@ shared: HF_HOME: "{EVAL_BASE_DIR}/hf_data" # where HuggingFace models and datasets are stored EVAL_OUTPUT_DIR: "{EVAL_BASE_DIR}/{USER}" # where evaluations are written GPUS_PER_NODE: 1 - HF_HUB_DISABLE_PROGRESS_BARS: "1" - HF_DATASETS_DISABLE_PROGRESS_BARS: "1" + HF_HUB_DISABLE_PROGRESS_BARS: "0" + HF_DATASETS_DISABLE_PROGRESS_BARS: "0" leonardo: hostname_pattern: "*.leonardo.local" # use this regexp to automatically assign environment variables corresponding to this YAML diff --git a/oellm/utils.py b/oellm/utils.py index dac7a49..90d0660 100644 --- a/oellm/utils.py +++ b/oellm/utils.py @@ -26,11 +26,81 @@ ) +@contextmanager +def suppress_tqdm_rendering(enabled: bool = True): + """ + Temporarily suppresses tqdm progress bar rendering when enabled=True. + + This prevents any visual rendering by overriding the class methods + responsible for output, without altering other behavior. + """ + if not enabled: + yield + return + + import tqdm as _tqdm + from tqdm import auto as _tqdm_auto + + classes = [_tqdm.tqdm, _tqdm_auto.tqdm] + seen: set[int] = set() + patched: list[tuple[object, str, object]] = [] + + for cls in classes: + cid = id(cls) + if cid in seen: + continue + seen.add(cid) + + if hasattr(cls, "display"): + orig_display = cls.display # type: ignore[attr-defined] + + def _noop_display(self, *args, **kwargs): + return None + + cls.display = _noop_display # type: ignore[assignment] + patched.append((cls, "display", orig_display)) + + if hasattr(cls, "refresh"): + orig_refresh = cls.refresh # type: ignore[attr-defined] + + def _noop_refresh(self, *args, **kwargs): + return None + + cls.refresh = _noop_refresh # type: ignore[assignment] + patched.append((cls, "refresh", orig_refresh)) + + try: + yield + finally: + for cls, name, orig in patched: + setattr(cls, name, orig) + + +def filter_tqdm(enabled: bool = True): + """ + Decorator factory to suppress tqdm rendering for the wrapped function + when enabled=True. + """ + + def _decorator(func): + @wraps(func) + def _wrapper(*args, **kwargs): + with suppress_tqdm_rendering(enabled=enabled): + return func(*args, **kwargs) + + return _wrapper + + return _decorator + + +@filter_tqdm(enabled=False) def _ensure_singularity_image(image_name: str) -> None: from huggingface_hub import hf_hub_download image_path = Path(os.getenv("EVAL_BASE_DIR")) / image_name + logging.info(f"Downloading latest Singularity image from HuggingFace: {image_name}") + try: hf_hub_download( repo_id="openeurollm/evaluation_singularity_images", @@ -187,6 +257,7 @@ def _expand_local_model_paths(model: str) -> list[Path]: return model_paths +@filter_tqdm(enabled=True) def _process_model_paths(models: Iterable[str]) -> dict[str, list[Path | str]]: """ Processes model strings into a dict of model paths. 
@@ -251,6 +322,7 @@ def _process_model_paths(models: Iterable[str]) -> dict[str, list[Path | str]]: return processed_model_paths +@filter_tqdm(enabled=True) def _pre_download_task_datasets( tasks: Iterable[str], trust_remote_code: bool = True ) -> None: @@ -319,6 +391,7 @@ def _pre_download_task_datasets( logging.debug(f"Finished dataset preparation for task '{task_name}'.") +@filter_tqdm(enabled=True) def _pre_download_lighteval_datasets(tasks: Iterable[str]) -> None: seen: set[str] = set() misses: list[str] = [] From ccf4c5a322800529fb6096e8abf0e31f56f977b9 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 13:18:42 +0200 Subject: [PATCH 24/39] misc --- apptainer/jureca.def | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apptainer/jureca.def b/apptainer/jureca.def index fe190ba..400c42b 100644 --- a/apptainer/jureca.def +++ b/apptainer/jureca.def @@ -18,7 +18,7 @@ From: nvcr.io/nvidia/pytorch:25.06-py3 # Optional: keep tool envs under /opt to avoid $HOME export UV_TOOL_DIR=/opt/uv-tools - uv tool install "lighteval[multilingual]" + uv tool install "lighteval[multilingual] @ git+https://github.com/huggingface/lighteval.git@63424f4e795ecc577b90646381b374af3a627978" %environment export PATH=/usr/local/bin:$PATH From 97b3d6953c6c65753ab3300deb1aaf8d4c876669 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 13:49:54 +0200 Subject: [PATCH 25/39] fix: lighteval tool python version --- apptainer/jureca.def | 2 +- apptainer/leonardo.def | 2 +- apptainer/lumi.def | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apptainer/jureca.def b/apptainer/jureca.def index 400c42b..68be6eb 100644 --- a/apptainer/jureca.def +++ b/apptainer/jureca.def @@ -18,7 +18,7 @@ From: nvcr.io/nvidia/pytorch:25.06-py3 # Optional: keep tool envs under /opt to avoid $HOME export UV_TOOL_DIR=/opt/uv-tools - uv tool install "lighteval[multilingual] @ git+https://github.com/huggingface/lighteval.git@63424f4e795ecc577b90646381b374af3a627978" + uv tool install --python 3.12 "lighteval[multilingual] @ git+https://github.com/huggingface/lighteval.git@63424f4e795ecc577b90646381b374af3a627978" %environment export PATH=/usr/local/bin:$PATH diff --git a/apptainer/leonardo.def b/apptainer/leonardo.def index 79f69f9..14a9576 100644 --- a/apptainer/leonardo.def +++ b/apptainer/leonardo.def @@ -18,7 +18,7 @@ From: nvcr.io/nvidia/pytorch:25.06-py3 # Optional: keep tool envs under /opt to avoid $HOME export UV_TOOL_DIR=/opt/uv-tools - uv tool install "lighteval[multilingual]" + uv tool install --python 3.12 "lighteval[multilingual] @ git+https://github.com/huggingface/lighteval.git@63424f4e795ecc577b90646381b374af3a627978" %environment export PATH=/usr/local/bin:$PATH diff --git a/apptainer/lumi.def b/apptainer/lumi.def index a7d71d7..2f7e8c4 100644 --- a/apptainer/lumi.def +++ b/apptainer/lumi.def @@ -18,7 +18,7 @@ From: rocm/pytorch:rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.7.1 # Optional: keep tool envs under /opt to avoid $HOME export UV_TOOL_DIR=/opt/uv-tools - uv tool install "lighteval[multilingual]" + uv tool install --python 3.12 "lighteval[multilingual] @ git+https://github.com/huggingface/lighteval.git@63424f4e795ecc577b90646381b374af3a627978" %environment export PATH=/usr/local/bin:$PATH From 541d3871c9fd213ba9423976e341c465d82d93b3 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 15:32:16 +0200 Subject: [PATCH 26/39] nltk setup --- apptainer/jureca.def | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git 
a/apptainer/jureca.def b/apptainer/jureca.def index 68be6eb..dc53dcc 100644 --- a/apptainer/jureca.def +++ b/apptainer/jureca.def @@ -19,11 +19,20 @@ From: nvcr.io/nvidia/pytorch:25.06-py3 # Optional: keep tool envs under /opt to avoid $HOME export UV_TOOL_DIR=/opt/uv-tools uv tool install --python 3.12 "lighteval[multilingual] @ git+https://github.com/huggingface/lighteval.git@63424f4e795ecc577b90646381b374af3a627978" + uv pip install --system --break-system-packages nltk + mkdir -p /opt/nltk_data + python - <<'PY' + import nltk + nltk.download('punkt', download_dir='/opt/nltk_data') + nltk.download('punkt_tab', download_dir='/opt/nltk_data') + print('nltk data downloaded') + PY %environment export PATH=/usr/local/bin:$PATH export UV_TOOL_BIN_DIR=/usr/local/bin export UV_TOOL_DIR=/opt/uv-tools + export NLTK_DATA=/opt/nltk_data %runscript From 006ab8d84e830042bb898d2151aec2b2cdbeffe8 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 15:38:31 +0200 Subject: [PATCH 27/39] nltk setup --- apptainer/jureca.def | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/apptainer/jureca.def b/apptainer/jureca.def index dc53dcc..915906a 100644 --- a/apptainer/jureca.def +++ b/apptainer/jureca.def @@ -22,11 +22,11 @@ From: nvcr.io/nvidia/pytorch:25.06-py3 uv pip install --system --break-system-packages nltk mkdir -p /opt/nltk_data python - <<'PY' - import nltk - nltk.download('punkt', download_dir='/opt/nltk_data') - nltk.download('punkt_tab', download_dir='/opt/nltk_data') - print('nltk data downloaded') - PY +import nltk +nltk.download('punkt', download_dir='/opt/nltk_data') +nltk.download('punkt_tab', download_dir='/opt/nltk_data') +print('nltk data downloaded') +PY %environment export PATH=/usr/local/bin:$PATH From 15bea15c6b4b4dba88baa4ade8573eb17253e946 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 17:44:20 +0200 Subject: [PATCH 28/39] fix: downloading nltk data for lighteval during container setup --- apptainer/jureca.def | 1 - apptainer/leonardo.def | 8 ++++++++ apptainer/lumi.def | 8 ++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/apptainer/jureca.def b/apptainer/jureca.def index 915906a..28a0391 100644 --- a/apptainer/jureca.def +++ b/apptainer/jureca.def @@ -25,7 +25,6 @@ From: nvcr.io/nvidia/pytorch:25.06-py3 import nltk nltk.download('punkt', download_dir='/opt/nltk_data') nltk.download('punkt_tab', download_dir='/opt/nltk_data') -print('nltk data downloaded') PY %environment diff --git a/apptainer/leonardo.def b/apptainer/leonardo.def index 14a9576..c9b2d74 100644 --- a/apptainer/leonardo.def +++ b/apptainer/leonardo.def @@ -19,11 +19,19 @@ From: nvcr.io/nvidia/pytorch:25.06-py3 # Optional: keep tool envs under /opt to avoid $HOME export UV_TOOL_DIR=/opt/uv-tools uv tool install --python 3.12 "lighteval[multilingual] @ git+https://github.com/huggingface/lighteval.git@63424f4e795ecc577b90646381b374af3a627978" + uv pip install --system --break-system-packages nltk + mkdir -p /opt/nltk_data + python - <<'PY' +import nltk +nltk.download('punkt', download_dir='/opt/nltk_data') +nltk.download('punkt_tab', download_dir='/opt/nltk_data') +PY %environment export PATH=/usr/local/bin:$PATH export UV_TOOL_BIN_DIR=/usr/local/bin export UV_TOOL_DIR=/opt/uv-tools + export NLTK_DATA=/opt/nltk_data %runscript exec bash "$@" \ No newline at end of file diff --git a/apptainer/lumi.def b/apptainer/lumi.def index 2f7e8c4..815d3a1 100644 --- a/apptainer/lumi.def +++ b/apptainer/lumi.def @@ -19,11 +19,19 @@ From: 
rocm/pytorch:rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.7.1 # Optional: keep tool envs under /opt to avoid $HOME export UV_TOOL_DIR=/opt/uv-tools uv tool install --python 3.12 "lighteval[multilingual] @ git+https://github.com/huggingface/lighteval.git@63424f4e795ecc577b90646381b374af3a627978" + uv pip install --system --break-system-packages nltk + mkdir -p /opt/nltk_data + python - <<'PY' +import nltk +nltk.download('punkt', download_dir='/opt/nltk_data') +nltk.download('punkt_tab', download_dir='/opt/nltk_data') +PY %environment export PATH=/usr/local/bin:$PATH export UV_TOOL_BIN_DIR=/usr/local/bin export UV_TOOL_DIR=/opt/uv-tools + export NLTK_DATA=/opt/nltk_data %runscript exec bash "$@" \ No newline at end of file From 9c97d25810689e4cdb3578a5cb1ed0574ccd5b56 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 17:44:54 +0200 Subject: [PATCH 29/39] suppressing all tqdm progress bars --- oellm/resources/clusters.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/oellm/resources/clusters.yaml b/oellm/resources/clusters.yaml index a963dad..738c25a 100644 --- a/oellm/resources/clusters.yaml +++ b/oellm/resources/clusters.yaml @@ -5,8 +5,8 @@ shared: HF_HOME: "{EVAL_BASE_DIR}/hf_data" # where HuggingFace models and datasets are stored EVAL_OUTPUT_DIR: "{EVAL_BASE_DIR}/{USER}" # where evaluations are written GPUS_PER_NODE: 1 - HF_HUB_DISABLE_PROGRESS_BARS: "0" - HF_DATASETS_DISABLE_PROGRESS_BARS: "0" + HF_HUB_DISABLE_PROGRESS_BARS: "1" + HF_DATASETS_DISABLE_PROGRESS_BARS: "1" leonardo: hostname_pattern: "*.leonardo.local" # use this regexp to automatically assign environment variables corresponding to this YAML From f11d4a4c8748407495ad490ed419170c8fe988d8 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 17:46:00 +0200 Subject: [PATCH 30/39] lighteval fixes --- oellm/resources/template.sbatch | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/oellm/resources/template.sbatch b/oellm/resources/template.sbatch index d02a93c..16cf705 100644 --- a/oellm/resources/template.sbatch +++ b/oellm/resources/template.sbatch @@ -122,7 +122,7 @@ do LIGHT_TASK_ARG="$LIGHT_TASK" fi else - LIGHT_TASK_ARG="${{LIGHT_TASK}}|$n_shot" + LIGHT_TASK_ARG="lighteval|${{LIGHT_TASK}}|$n_shot|0" fi fi @@ -135,8 +135,8 @@ do lighteval accelerate \ "model_name=$model_path,trust_remote_code=True" \ "$LIGHT_TASK_ARG" \ - --output-dir "$RESULTS_SUBDIR" \ - --save-details + --custom-tasks lighteval.tasks.multilingual.tasks \ + --output-dir "$RESULTS_SUBDIR" ;; *) echo "[warning] Unknown evaluation suite '$eval_suite'. Skipping." 
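The template change in PATCH 30 above pins the LightEval argument to the four-field form `suite|task|num_fewshot|truncate_few_shots`, matching how the pre-download helper in `oellm/utils.py` normalises bare task names; the doubled braces in `${{LIGHT_TASK}}` are shell braces escaped for the Python format-string rendering of the sbatch template. A hedged sketch of that normalisation (a standalone helper named here for illustration only, not a function in the codebase):

```python
# Hedged sketch: normalise a task entry into the 4-field LightEval spec
# "suite|task|num_fewshot|truncate_few_shots", mirroring the logic used by
# _pre_download_lighteval_datasets and the sbatch template above.
def to_lighteval_spec(task: str, n_shot: int = 0, truncate: int = 0) -> str:
    spec = task
    if "|" not in spec:
        spec = f"lighteval|{spec}|{n_shot}|{truncate}"
    elif spec.count("|") == 1:
        spec = f"{spec}|{n_shot}|{truncate}"
    elif spec.count("|") == 2:
        spec = f"{spec}|{truncate}"
    return spec

assert (
    to_lighteval_spec("flores200:pol_Latn-eng_Latn", 5)
    == "lighteval|flores200:pol_Latn-eng_Latn|5|0"
)
```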
From 096cbc07c2726eac1c9168ba385bf8b48b577f01 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 17:49:46 +0200 Subject: [PATCH 31/39] misc --- oellm/resources/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 oellm/resources/__init__.py diff --git a/oellm/resources/__init__.py b/oellm/resources/__init__.py deleted file mode 100644 index e69de29..0000000 From 6e888d7bf4b98a50cf27582b0aa14d4b03121f98 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 17:49:59 +0200 Subject: [PATCH 32/39] feat: aya-expanse tasks --- oellm/resources/task-groups.yaml | 52 ++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/oellm/resources/task-groups.yaml b/oellm/resources/task-groups.yaml index ee081f2..957581e 100644 --- a/oellm/resources/task-groups.yaml +++ b/oellm/resources/task-groups.yaml @@ -134,6 +134,55 @@ task_groups: - task: global_mmlu_full_tr - task: global_mmlu_full_uk - task: global_mmlu_full_he + mgsm-eu: + description: "EU Language GSM benchmarks in Aya Expanse" + suite: lm-eval-harness + n_shots: [5] + tasks: + - task: mgsm_native_cot_en + - task: mgsm_native_cot_de + - task: mgsm_native_cot_es + - task: mgsm_native_cot_fr + + generic-multilingual: + description: "Generic multilingual benchmarks in Aya Expanse" + suite: lm-eval-harness + n_shots: [0] + tasks: + - task: xwinograd + - task: xcopa + - task: xstorycloze + + include: + description: "INCLUDE benchmarks in Aya Expanse" + suite: lm-eval-harness + n_shots: [0] + tasks: + - task: include_base_44_albanian + - task: include_base_44_armenian + - task: include_base_44_azerbaijani + - task: include_base_44_basque + - task: include_base_44_belarusian + - task: include_base_44_bulgarian + - task: include_base_44_croatian + - task: include_base_44_dutch + - task: include_base_44_estonian + - task: include_base_44_finnish + - task: include_base_44_french + - task: include_base_44_georgian + - task: include_base_44_german + - task: include_base_44_greek + - task: include_base_44_hungarian + - task: include_base_44_italian + - task: include_base_44_lithuanian + - task: include_base_44_north macedonian + - task: include_base_44_polish + - task: include_base_44_portuguese + - task: include_base_44_russian + - task: include_base_44_serbian + - task: include_base_44_spanish + - task: include_base_44_turkish + - task: include_base_44_ukrainian super_groups: oellm-multilingual: @@ -143,3 +192,6 @@ super_groups: - task: flores-200-eng-to-eu - task: belebele-eu-5-shot - task: global-mmlu-eu + - task: mgsm-eu + - task: generic-multilingual + - task: include From 9d87217e3ef2496674cf58093661d1b6139e8d90 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 20:51:07 +0200 Subject: [PATCH 33/39] chore: schedule-eval logic cleanup --- oellm/main.py | 263 ++++++++++++++++--------------------------- oellm/task_cache.py | 10 +- oellm/task_groups.py | 20 +++- 3 files changed, 117 insertions(+), 176 deletions(-) diff --git a/oellm/main.py b/oellm/main.py index e04d87d..92bd230 100644 --- a/oellm/main.py +++ b/oellm/main.py @@ -1,8 +1,8 @@ import logging import os import re -import shutil import subprocess +from dataclasses import dataclass from datetime import datetime from importlib.resources import files from pathlib import Path @@ -17,6 +17,7 @@ from oellm.utils import ( _ensure_singularity_image, _expand_local_model_paths, + _filter_warnings, _load_cluster_env, _num_jobs_in_queue, _pre_download_lighteval_datasets, @@ -27,6 +28,14 @@ ) +@dataclass 
+class EvaluationJob: + model_path: Path | str + task_path: str + n_shot: int + eval_suite: str + + @capture_third_party_output_from_kwarg("verbose") def schedule_evals( models: str | None = None, @@ -75,16 +84,20 @@ def schedule_evals( _load_cluster_env() if not skip_checks: - image_name = os.environ.get("EVAL_CONTAINER_IMAGE") - if image_name is None: - raise ValueError( - "EVAL_CONTAINER_IMAGE is not set. Please set it in clusters.yaml." - ) - - _ensure_singularity_image(image_name) + _ensure_singularity_image(os.environ.get("EVAL_CONTAINER_IMAGE")) # type: ignore else: logging.info("Skipping container image check (--skip-checks enabled)") + if isinstance(models, str) and models is not None: + models = [m.strip() for m in models.split(",") if m.strip()] # type: ignore + + if isinstance(tasks, str) and tasks is not None: + tasks = [t.strip() for t in tasks.split(",") if t.strip()] # type: ignore + + if isinstance(n_shot, int) and n_shot is not None: + n_shot = [n_shot] + + eval_jobs: list[EvaluationJob] = [] if eval_csv_path: if models or tasks or task_groups or n_shot: raise ValueError( @@ -104,133 +117,97 @@ def schedule_evals( # Always expand local model paths, even with skip_checks df["model_path"].unique() - expanded_rows = [] - for _, row in df.iterrows(): - original_model_path = row["model_path"] - local_paths = _expand_local_model_paths(original_model_path) - if local_paths: - # Use expanded local paths - for expanded_path in local_paths: - new_row = row.copy() - new_row["model_path"] = expanded_path - expanded_rows.append(new_row) - else: - # Keep original path (might be HF model) - expanded_rows.append(row) - df = pd.DataFrame(expanded_rows) - - if "eval_suite" not in df.columns: - df["eval_suite"] = "lm_eval" - - # Download HF models only if skip_checks is False - if not skip_checks: - # Process any HF models that need downloading - hf_models = [m for m in df["model_path"].unique() if not Path(m).exists()] - if hf_models: - model_path_map = _process_model_paths(hf_models) - # Update the dataframe with processed HF models - for idx, row in df.iterrows(): - if row["model_path"] in model_path_map: - # This shouldn't expand further, just update the path - df.at[idx, "model_path"] = model_path_map[row["model_path"]][0] - else: - logging.info( - "Skipping model path processing and validation (--skip-checks enabled)" - ) - - elif models and ((tasks and n_shot is not None) or task_groups): - model_list = [m.strip() for m in models.split(",") if m.strip()] - model_paths: list[Path | str] = [] + eval_jobs.extend( + [ + EvaluationJob( + model_path=row["model_path"], + task_path=row["task_path"], + n_shot=row["n_shot"], + eval_suite=row["eval_suite"], + ) + for _, row in df.iterrows() + ] + ) - # Always expand local paths - for model in model_list: - local_paths = _expand_local_model_paths(model) - if local_paths: - model_paths.extend(local_paths) - else: - model_paths.append(model) - - # Download HF models only if skip_checks is False - if not skip_checks: - hf_models = [m for m in model_paths if not Path(m).exists()] - if hf_models: - model_path_map = _process_model_paths(hf_models) - # Replace HF model identifiers with processed paths - model_paths = [ - model_path_map[m][0] if m in model_path_map else m - for m in model_paths + elif models: + if task_groups is None: + eval_jobs.extend( + [ + EvaluationJob( + model_path=model, + task_path=task, + n_shot=shot, + eval_suite="lm_eval", + ) + for model in models + for task in tasks + for shot in n_shot ] + ) else: - logging.info( - 
"Skipping model path processing and validation (--skip-checks enabled)" + expanded = _expand_task_groups([g.strip() for g in task_groups.split(",")]) + eval_jobs.extend( + [ + EvaluationJob( + model_path=model, + task_path=result.task, + n_shot=result.n_shot, + eval_suite=result.suite, + ) + for model in models + for result in expanded + ] ) - rows: list[dict[str, Path | str | int]] = [] - - # Handle explicit tasks (lm_eval) with provided n_shot - if tasks: - if n_shot is None: - raise ValueError( - "When specifying `tasks`, you must also provide `n_shot`. For task groups, use `task_groups`." + expanded_eval_jobs = [] + for job in eval_jobs: + local_model_paths = _expand_local_model_paths(job.model_path) + if not local_model_paths: + expanded_eval_jobs.append(job) + else: + for path in local_model_paths: + expanded_eval_jobs.append( + EvaluationJob( + model_path=path, + task_path=job.task_path, + n_shot=job.n_shot, + eval_suite=job.eval_suite, + ) ) - tasks_list = [t.strip() for t in tasks.split(",") if t.strip()] - shots: list[int] - shots = n_shot if isinstance(n_shot, list) else [int(n_shot)] - for model_path in model_paths: - for task_name in tasks_list: - for s in shots: - rows.append( - { - "model_path": model_path, - "task_path": task_name, - "n_shot": int(s), - "eval_suite": "lm_eval", - } - ) - # Handle task groups - if task_groups: - group_names = [g.strip() for g in task_groups.split(",") if g.strip()] - # import pdb; pdb.set_trace() - expanded = _expand_task_groups(group_names) - for model_path in model_paths: - for task_name, n_shots, suite_name in expanded: - for s in n_shots: - rows.append( - { - "model_path": model_path, - "task_path": task_name, - "n_shot": int(s), - "eval_suite": suite_name, - } - ) - - df = pd.DataFrame( - rows, columns=["model_path", "task_path", "n_shot", "eval_suite"] - ) + if not skip_checks: + hub_models: set[str | Path] = { + job.model_path + for job in expanded_eval_jobs + if not Path(job.model_path).exists() + } + _process_model_paths(hub_models) else: - raise ValueError( - "Provide `eval_csv_path`, or `models` with (`tasks` and `n_shot`) and/or `task_groups`." + logging.info( + "Skipping model path processing and validation (--skip-checks enabled)" ) + # create csv + df = pd.DataFrame(expanded_eval_jobs) + if df.empty: logging.warning("No evaluation jobs to schedule.") return None + df["eval_suite"] = df["eval_suite"].str.lower() + # Ensure that all datasets required by the tasks are cached locally to avoid # network access on compute nodes. 
if not skip_checks: - lm_eval_tasks = df[ - df["eval_suite"].str.lower().isin({"lm_eval", "lm-eval", "lm-eval-harness"}) - ]["task_path"].unique() + lm_eval_tasks = df[df["eval_suite"].isin({"lm-eval-harness"})][ + "task_path" + ].unique() if len(lm_eval_tasks) > 0: _pre_download_task_datasets( lm_eval_tasks, trust_remote_code=trust_remote_code ) # Pre-download LightEval datasets (best-effort, incremental support) - light_eval_tasks = df[ - df["eval_suite"].str.lower().isin({"lighteval", "light-eval"}) - ]["task_path"].unique() + light_eval_tasks = df[df["eval_suite"].isin({"light-eval"})]["task_path"].unique() if len(light_eval_tasks) > 0: _pre_download_lighteval_datasets(light_eval_tasks) else: @@ -239,8 +216,9 @@ def schedule_evals( if download_only: return None - queue_limit = int(os.environ.get("QUEUE_LIMIT", 250)) - remaining_queue_capacity = queue_limit - _num_jobs_in_queue() + remaining_queue_capacity = ( + int(os.environ.get("QUEUE_LIMIT", 250)) - _num_jobs_in_queue() + ) if remaining_queue_capacity <= 0: logging.warning("No remaining queue capacity. Not scheduling any jobs.") @@ -269,61 +247,24 @@ def schedule_evals( df.to_csv(csv_path, index=False) - logging.debug(f"Saved evaluation dataframe to temporary CSV: {csv_path}") - sbatch_template = (files("oellm.resources") / "template.sbatch").read_text() # Calculate dynamic array size and time limits total_evals = len(df) - - # fixed timing estimation minutes_per_eval = 10 # Budget 10 minutes per eval total_minutes = total_evals * minutes_per_eval - - # Copy LightEval benchmark files into evaluation directory if necessary - # TODO: why do we need this? - light_eval_paths = df[df["eval_suite"].str.lower().isin({"lighteval", "light-eval"})][ - "task_path" - ].unique() - benchmark_dir = evals_dir / "light_eval_tasks" - copied_paths: dict[str, str] = {} - if light_eval_paths.size > 0: - benchmark_dir.mkdir(parents=True, exist_ok=True) - for task_path in light_eval_paths: - candidate = Path(task_path) - if candidate.exists() and candidate.is_file(): - destination = benchmark_dir / candidate.name - shutil.copy(candidate, destination) - copied_paths[str(candidate)] = str(destination) - - if copied_paths: - df.replace({"task_path": copied_paths}, inplace=True) - - # Maximum runtime per job (18 hours with safety margin) max_minutes_per_job = 18 * 60 # 18 hours min_array_size_for_time = max(1, int(np.ceil(total_minutes / max_minutes_per_job))) desired_array_size = min(128, total_evals) if total_evals >= 128 else total_evals if desired_array_size < min_array_size_for_time: desired_array_size = min_array_size_for_time - - # The actual array size is limited by queue capacity and total evals actual_array_size = min(remaining_queue_capacity, desired_array_size, total_evals) - - # Calculate actual time per job evals_per_job = max(1, int(np.ceil(total_evals / actual_array_size))) minutes_per_job = evals_per_job * minutes_per_eval - - # Add 20% safety margin and round up to nearest hour minutes_with_margin = int(minutes_per_job * 1.2) hours_with_margin = max(1, int(np.ceil(minutes_with_margin / 60))) - - # Apply 3-hour safety minimum for array jobs hours_with_margin = max(hours_with_margin, 3) - - # Cap at 24 hours hours_with_margin = min(hours_with_margin, 23) - - # Format time limit for SLURM (HH:MM:SS) time_limit = f"{hours_with_margin:02d}:59:00" # Log the calculated values @@ -343,8 +284,6 @@ def schedule_evals( ) logging.info(f" Time limit with safety margin: {time_limit}") - # replace the placeholders in the template with the actual values - # 
First, replace python-style placeholders sbatch_script = sbatch_template.format( csv_path=csv_path, max_array_len=max_array_len, @@ -356,13 +295,10 @@ def schedule_evals( time_limit=time_limit, # Dynamic time limit ) - # substitute any $ENV_VAR occurrences (e.g., $TIME_LIMIT) since env vars are not - # expanded in the #SBATCH directives + # substitute any $ENV_VAR occurrences sbatch_script = Template(sbatch_script).safe_substitute(os.environ) - # Save the sbatch script to the evals directory sbatch_script_path = evals_dir / "submit_evals.sbatch" - logging.debug(f"Saving sbatch script to {sbatch_script_path}") with open(sbatch_script_path, "w") as f: f.write(sbatch_script) @@ -555,7 +491,7 @@ def collect_results( # Print summary statistics if verbose: - logging.info("\nSummary:") + logging.info("Summary:") logging.info(f"Unique models: {df['model_name'].nunique()}") logging.info(f"Unique tasks: {df['task'].nunique()}") logging.info( @@ -564,7 +500,7 @@ def collect_results( # Perform check analysis if requested if check: - logging.info("\n=== Evaluation Status Check ===") + logging.info("=== Evaluation Status Check ===") # Find missing jobs missing_jobs = [] @@ -599,7 +535,7 @@ def collect_results( completed_count = len(jobs_df) - len(missing_jobs) - logging.info(f"\nTotal scheduled jobs: {len(jobs_df)}") + logging.info(f"Total scheduled jobs: {len(jobs_df)}") logging.info(f"Completed jobs: {completed_count}") logging.info(f"Missing jobs: {len(missing_jobs)}") @@ -607,14 +543,14 @@ def collect_results( missing_df = pd.DataFrame(missing_jobs) missing_csv = output_csv.replace(".csv", "_missing.csv") missing_df.to_csv(missing_csv, index=False) - logging.info(f"\nMissing jobs saved to: {missing_csv}") + logging.info(f"Missing jobs saved to: {missing_csv}") logging.info( f"You can run these with: oellm schedule-eval --eval_csv_path {missing_csv}" ) # Show some examples if verbose if verbose and len(missing_jobs) > 0: - logging.info("\nExample missing jobs:") + logging.info("Example missing jobs:") for _i, (_, job) in enumerate(missing_df.head(5).iterrows()): logging.info( f" - {job['model_path']} | {job['task_path']} | n_shot={job['n_shot']}" @@ -624,6 +560,7 @@ def collect_results( def main(): + _filter_warnings() auto_cli( { "schedule-eval": schedule_evals, diff --git a/oellm/task_cache.py b/oellm/task_cache.py index 2fdeea0..7b58e52 100644 --- a/oellm/task_cache.py +++ b/oellm/task_cache.py @@ -284,21 +284,13 @@ def prewarm_from_payload(payload: dict | None, *, trust_remote_code: bool = True if not isinstance(call, dict): continue # Unified prewarm log message - dataset_id = None if call.get("type") == "load_dataset": path = call.get("path") name = call.get("name") - dataset_id = f"{path}{'::' + name if name else ''}" else: repo_id = call.get("repo_id") filename = call.get("filename") - dataset_id = ( - f"{repo_id}{'/' + filename if filename else ''}" - if isinstance(repo_id, str) - else None - ) - if dataset_id: - logging.info(f"Prewarming dataset cache: {dataset_id}") + if call.get("type") == "snapshot_download": repo_id = call.get("repo_id") if isinstance(repo_id, str) and repo_id: diff --git a/oellm/task_groups.py b/oellm/task_groups.py index 1419de7..df3f496 100644 --- a/oellm/task_groups.py +++ b/oellm/task_groups.py @@ -1,3 +1,4 @@ +from typing import TypedDict from collections.abc import Iterable from dataclasses import dataclass from importlib.resources import files @@ -107,25 +108,36 @@ def _parse_task_groups( } -def _expand_task_groups(group_names: Iterable[str]) -> list[tuple[str, 
list[int], str]]: +@dataclass +class TaskGroupResult: + task: str + n_shot: int + suite: str + + +def _expand_task_groups(group_names: Iterable[str]) -> list[TaskGroupResult]: parsed = _parse_task_groups([str(n).strip() for n in group_names if str(n).strip()]) missing = {str(n).strip() for n in group_names if str(n).strip()} - set(parsed.keys()) if missing: raise ValueError(f"Unknown task group(s): {', '.join(sorted(missing))}") - results: list[tuple[str, list[int], str]] = [] + results: list[TaskGroupResult] = [] for _, group in parsed.items(): if isinstance(group, TaskGroup): suite = group.suite for t in group.tasks: shots = [int(s) for s in (t.n_shots or [])] - results.append((t.name, shots, suite)) + for shot in shots: + results.append(TaskGroupResult(task=t.name, n_shot=shot, suite=suite)) else: for g in group.task_groups: suite = g.suite for t in g.tasks: shots = [int(s) for s in (t.n_shots or [])] - results.append((t.name, shots, suite)) + for shot in shots: + results.append( + TaskGroupResult(task=t.name, n_shot=shot, suite=suite) + ) return results From 4f9f8a8e88522c63ae9a45ed0010e4f53eff7bc1 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 20:51:30 +0200 Subject: [PATCH 34/39] feat: adding spinners --- oellm/utils.py | 419 +++++++++++++++++++++++-------------------------- 1 file changed, 200 insertions(+), 219 deletions(-) diff --git a/oellm/utils.py b/oellm/utils.py index 90d0660..ef2ed5f 100644 --- a/oellm/utils.py +++ b/oellm/utils.py @@ -25,90 +25,32 @@ task_cache_set_payload, ) +_RICH_CONSOLE: Console | None = None -@contextmanager -def suppress_tqdm_rendering(enabled: bool = True): - """ - Temporarily suppresses tqdm progress bar rendering when enabled=True. - - This prevents any visual rendering by overriding the class methods - responsible for output, without altering other behavior. - """ - if not enabled: - yield - return - - import tqdm as _tqdm - from tqdm import auto as _tqdm_auto - - classes = [_tqdm.tqdm, _tqdm_auto.tqdm] - seen: set[int] = set() - patched: list[tuple[object, str, object]] = [] - - for cls in classes: - cid = id(cls) - if cid in seen: - continue - seen.add(cid) - - if hasattr(cls, "display"): - orig_display = cls.display # type: ignore[attr-defined] - - def _noop_display(self, *args, **kwargs): - return None - - cls.display = _noop_display # type: ignore[assignment] - patched.append((cls, "display", orig_display)) - - if hasattr(cls, "refresh"): - orig_refresh = cls.refresh # type: ignore[attr-defined] - - def _noop_refresh(self, *args, **kwargs): - return None - - cls.refresh = _noop_refresh # type: ignore[assignment] - patched.append((cls, "refresh", orig_refresh)) - - try: - yield - finally: - for cls, name, orig in patched: - setattr(cls, name, orig) - - -def filter_tqdm(enabled: bool = True): - """ - Decorator factory to suppress tqdm rendering for the wrapped function - when enabled=True. 
- """ - def _decorator(func): - @wraps(func) - def _wrapper(*args, **kwargs): - with suppress_tqdm_rendering(enabled=enabled): - return func(*args, **kwargs) +def get_console() -> Console: + global _RICH_CONSOLE + if _RICH_CONSOLE is None: + _RICH_CONSOLE = Console() + return _RICH_CONSOLE - return _wrapper - - return _decorator - -@filter_tqdm(enabled=False) def _ensure_singularity_image(image_name: str) -> None: from huggingface_hub import hf_hub_download image_path = Path(os.getenv("EVAL_BASE_DIR")) / image_name - logging.info(f"Downloading latest Singularity image from HuggingFace: {image_name}") - try: - hf_hub_download( - repo_id="openeurollm/evaluation_singularity_images", - filename=image_name, - repo_type="dataset", - local_dir=os.getenv("EVAL_BASE_DIR"), - ) - logging.info("Successfully downloaded latest Singularity image from HuggingFace") + console = get_console() + with console.status( + "Downloading latest Singularity image from HuggingFace", spinner="dots" + ): + hf_hub_download( + repo_id="openeurollm/evaluation_singularity_images", + filename=image_name, + repo_type="dataset", + local_dir=os.getenv("EVAL_BASE_DIR"), + ) except Exception as e: logging.warning( "Failed to fetch latest container image from HuggingFace: %s", str(e) @@ -121,15 +63,10 @@ def _ensure_singularity_image(image_name: str) -> None: f"Cannot proceed with evaluation scheduling." ) from e - logging.info( - "Singularity image ready at %s", - Path(os.getenv("EVAL_BASE_DIR")) / os.getenv("EVAL_CONTAINER_IMAGE"), - ) - def _setup_logging(verbose: bool = False): rich_handler = RichHandler( - console=Console(), + console=get_console(), show_time=True, log_time_format="%H:%M:%S", show_path=False, @@ -208,7 +145,7 @@ def _num_jobs_in_queue() -> int: return sum(1 for line in output.splitlines() if line.strip()) -def _expand_local_model_paths(model: str) -> list[Path]: +def _expand_local_model_paths(model: str | Path) -> list[Path]: """ Expands a local model path to include all checkpoints if it's a directory. Recursively searches for models in subdirectories. @@ -257,8 +194,7 @@ def _expand_local_model_paths(model: str) -> list[Path]: return model_paths -@filter_tqdm(enabled=True) -def _process_model_paths(models: Iterable[str]) -> dict[str, list[Path | str]]: +def _process_model_paths(models: Iterable[str]): """ Processes model strings into a dict of model paths. 
@@ -267,86 +203,102 @@ def _process_model_paths(models: Iterable[str]) -> dict[str, list[Path | str]]: """ from huggingface_hub import snapshot_download - processed_model_paths: dict[str, list[Path | str]] = {} + console = get_console() + models_list = list(models) - for model in models: - per_model_paths: list[Path | str] = [] + with console.status( + f"Processing models… 0/{len(models_list)}", spinner="dots" + ) as status: + for idx, model in enumerate(models_list, 1): + status.update(f"Checking model '{model}' ({idx}/{len(models_list)})") + per_model_paths: list[Path | str] = [] - local_paths = _expand_local_model_paths(model) - if local_paths: - per_model_paths.extend(local_paths) - else: - logging.info( - f"Model {model} not found locally, assuming it is a πŸ€— hub model" - ) - logging.debug( - f"Downloading model {model} on the login node since the compute nodes may not have access to the internet" - ) - - if "," in model: - model_kwargs = dict( - [kv.split("=") for kv in model.split(",") if "=" in kv] + local_paths = _expand_local_model_paths(model) + if local_paths: + per_model_paths.extend(local_paths) + status.update(f"Using local model '{model}' ({idx}/{len(models_list)})") + else: + logging.info( + f"Model {model} not found locally, assuming it is a πŸ€— hub model" ) + logging.debug( + f"Downloading model {model} on the login node since the compute nodes may not have access to the internet" + ) + + if "," in model: + model_kwargs = dict( + [kv.split("=") for kv in model.split(",") if "=" in kv] + ) - repo_id = model.split(",")[0] + repo_id = model.split(",")[0] - snapshot_kwargs = {} - if "revision" in model_kwargs: - snapshot_kwargs["revision"] = model_kwargs["revision"] + snapshot_kwargs = {} + if "revision" in model_kwargs: + snapshot_kwargs["revision"] = model_kwargs["revision"] - try: + status.update(f"Downloading '{repo_id}' ({idx}/{len(models_list)})") + try: + snapshot_download( + repo_id=repo_id, + cache_dir=Path(os.getenv("HF_HOME")) / "hub", + **snapshot_kwargs, + ) + per_model_paths.append(model) + except Exception as e: + logging.debug( + f"Failed to download model {model} from Hugging Face Hub. Continuing..." + ) + logging.debug(e) + else: + status.update(f"Downloading '{model}' ({idx}/{len(models_list)})") snapshot_download( - repo_id=repo_id, + repo_id=model, cache_dir=Path(os.getenv("HF_HOME")) / "hub", - **snapshot_kwargs, ) per_model_paths.append(model) - except Exception as e: - logging.debug( - f"Failed to download model {model} from Hugging Face Hub. Continuing..." - ) - logging.debug(e) - else: - snapshot_download( - repo_id=model, - cache_dir=Path(os.getenv("HF_HOME")) / "hub", - ) - per_model_paths.append(model) - - if not per_model_paths: - logging.warning( - f"Could not find any valid model for '{model}'. It will be skipped." - ) - processed_model_paths[model] = per_model_paths - return processed_model_paths + if not per_model_paths: + logging.warning( + f"Could not find any valid model for '{model}'. It will be skipped." + ) -@filter_tqdm(enabled=True) def _pre_download_task_datasets( tasks: Iterable[str], trust_remote_code: bool = True ) -> None: processed: set[str] = set() misses: list[str] = [] - for task_name in tasks: - if not isinstance(task_name, str) or task_name in processed: - continue - processed.add(task_name) - if task_cache_lookup("lm-eval", task_name): - logging.info( - f"Skipping dataset preparation for task '{task_name}' (cache hit within TTL)." 
+ console = get_console() + with console.status("Checking lm-eval datasets…", spinner="dots") as status: + cache_hits = 0 + for task_name in tasks: + if not isinstance(task_name, str) or task_name in processed: + continue + processed.add(task_name) + if task_cache_lookup("lm-eval", task_name): + cache_hits += 1 + status.update( + f"Checking lm-eval datasets… {cache_hits} cached, {len(misses)} to prepare" + ) + continue + misses.append(task_name) + status.update( + f"Checking lm-eval datasets… {cache_hits} cached, {len(misses)} to prepare" ) - continue - misses.append(task_name) if not misses: - for task_name in processed: - if task_cache_lookup("lm-eval", task_name): - prewarm_from_payload( - task_cache_get_payload("lm-eval", task_name), - trust_remote_code=trust_remote_code, - ) + with console.status( + f"Using cached lm-eval datasets for {len(processed)} tasks…", + spinner="dots", + ) as status: + for task_name in processed: + if task_cache_lookup("lm-eval", task_name): + status.update(f"Loading cached dataset for '{task_name}'…") + prewarm_from_payload( + task_cache_get_payload("lm-eval", task_name), + trust_remote_code=trust_remote_code, + ) return from datasets import DownloadMode # type: ignore @@ -354,96 +306,115 @@ def _pre_download_task_datasets( tm = TaskManager() - for task_name in misses: - logging.info( - f"Preparing dataset for task '{task_name}' (download if not cached)…" - ) - - task_config = { - "task": task_name, - "dataset_kwargs": {"trust_remote_code": trust_remote_code}, - } - - with capture_hf_dataset_calls() as captured_calls: - task_objects = tm.load_config(task_config) - - stack = [task_objects] - while stack: - current = stack.pop() - if isinstance(current, dict): - stack.extend(current.values()) - continue - if hasattr(current, "download") and callable(current.download): - try: - current.download( - download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS - ) # type: ignore[arg-type] - except TypeError as e: - logging.error( - f"Failed to download dataset for task '{task_name}' with download_mode=REUSE_DATASET_IF_EXISTS: {e}" - ) - current.download() # type: ignore[misc] - - if captured_calls: - payload = {"calls": dedupe_calls(captured_calls)} - task_cache_set_payload("lm-eval", task_name, payload) - task_cache_mark_resolved("lm-eval", task_name) - logging.debug(f"Finished dataset preparation for task '{task_name}'.") + with console.status( + f"Preparing lm-eval datasets… {len(misses)} remaining", + spinner="dots", + ) as status: + for idx, task_name in enumerate(misses, 1): + status.update(f"Preparing dataset for '{task_name}' ({idx}/{len(misses)})") + + task_config = { + "task": task_name, + "dataset_kwargs": {"trust_remote_code": trust_remote_code}, + } + + with capture_hf_dataset_calls() as captured_calls: + task_objects = tm.load_config(task_config) + + stack = [task_objects] + while stack: + current = stack.pop() + if isinstance(current, dict): + stack.extend(current.values()) + continue + if hasattr(current, "download") and callable(current.download): + try: + current.download( + download_mode=DownloadMode.REUSE_DATASET_IF_EXISTS + ) # type: ignore[arg-type] + except TypeError as e: + logging.error( + f"Failed to download dataset for task '{task_name}' with download_mode=REUSE_DATASET_IF_EXISTS: {e}" + ) + current.download() # type: ignore[misc] + + if captured_calls: + payload = {"calls": dedupe_calls(captured_calls)} + task_cache_set_payload("lm-eval", task_name, payload) + task_cache_mark_resolved("lm-eval", task_name) + logging.debug(f"Finished dataset 
preparation for task '{task_name}'.") -@filter_tqdm(enabled=True) def _pre_download_lighteval_datasets(tasks: Iterable[str]) -> None: seen: set[str] = set() misses: list[str] = [] tasks = [str(task).strip() for task in tasks] - for task in tasks: - if not task or task in seen: - continue - seen.add(task) - if task_cache_lookup("lighteval", task): - logging.info( - f"Skipping dataset preparation for task '{task}' (cache hit within TTL)." - ) - continue - misses.append(task) - - if not misses: - for task in seen: + console = get_console() + with console.status("Checking lighteval datasets…", spinner="dots") as status: + cache_hits = 0 + for task in tasks: + if not task or task in seen: + continue + seen.add(task) if task_cache_lookup("lighteval", task): - prewarm_from_payload( - task_cache_get_payload("lighteval", task), - trust_remote_code=True, + cache_hits += 1 + status.update( + f"Checking lighteval datasets… {cache_hits} cached, {len(misses)} to prepare" ) - return - - for task in misses: - with capture_hf_dataset_calls() as captured_calls: - from lighteval.tasks.lighteval_task import LightevalTask - from lighteval.tasks.registry import ( - TRUNCATE_FEW_SHOTS_DEFAULTS, - Registry, + continue + misses.append(task) + status.update( + f"Checking lighteval datasets… {cache_hits} cached, {len(misses)} to prepare" ) - reg = Registry(custom_tasks="lighteval.tasks.multilingual.tasks") - truncate_default = int(TRUNCATE_FEW_SHOTS_DEFAULTS) - - spec = task - if "|" not in spec: - spec = f"lighteval|{spec}|0|{truncate_default}" - elif spec.count("|") == 1: - spec = f"{spec}|0|{truncate_default}" - elif spec.count("|") == 2: - spec = f"{spec}|{truncate_default}" + if not misses: + with console.status( + f"Using cached lighteval datasets for {len(seen)} tasks…", + spinner="dots", + ): + for task in seen: + if task_cache_lookup("lighteval", task): + prewarm_from_payload( + task_cache_get_payload("lighteval", task), + trust_remote_code=True, + ) + return - configs = reg.get_tasks_configs(spec) - task_dict = reg.get_tasks_from_configs(configs) - LightevalTask.load_datasets(task_dict) + with console.status( + f"Preparing lighteval datasets… {len(misses)} remaining", + spinner="dots", + ) as status: + for idx, task in enumerate(misses, 1): + status.update(f"Preparing dataset for '{task}' ({idx}/{len(misses)})") + with capture_hf_dataset_calls() as captured_calls: + from lighteval.tasks.lighteval_task import LightevalTask + from lighteval.tasks.registry import ( + TRUNCATE_FEW_SHOTS_DEFAULTS, + Registry, + ) - payload = ( - {"calls": dedupe_calls(captured_calls)} if captured_calls else {"calls": []} - ) - task_cache_set_payload("lighteval", task, payload) - task_cache_mark_resolved("lighteval", task) + reg = Registry(custom_tasks="lighteval.tasks.multilingual.tasks") + truncate_default = int(TRUNCATE_FEW_SHOTS_DEFAULTS) + + spec = task + if "|" not in spec: + spec = f"lighteval|{spec}|0|{truncate_default}" + elif spec.count("|") == 1: + spec = f"{spec}|0|{truncate_default}" + elif spec.count("|") == 2: + spec = f"{spec}|{truncate_default}" + + configs = reg.get_tasks_configs(spec) + task_dict = reg.get_tasks_from_configs(configs) + LightevalTask.load_datasets(task_dict) + + payload = ( + {"calls": dedupe_calls(captured_calls)} + if captured_calls + else {"calls": []} + ) + task_cache_set_payload("lighteval", task, payload) + task_cache_mark_resolved("lighteval", task) @contextmanager @@ -546,3 +517,13 @@ def _wrapper(*args, **kwargs): return _wrapper return _decorator + + +def _filter_warnings(): + """ + 
Filters warnings from the lm_eval and lighteval libraries. + """ + import warnings + + warnings.filterwarnings("ignore", module="lm_eval") + warnings.filterwarnings("ignore", module="lighteval") From fe067fac795f40b5a50cec69c812667b8b19b59d Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 20:53:27 +0200 Subject: [PATCH 35/39] chore: making pre-commit happy --- .github/workflows/ci.yml | 1 - .gitignore | 2 +- .pre-commit-config.yaml | 3 +-- apptainer/build_sif_local.sh | 2 +- apptainer/jureca.def | 4 ++-- apptainer/leonardo.def | 4 ++-- apptainer/lumi.def | 4 ++-- oellm/resources/task-groups.yaml | 4 ++-- oellm/task_cache.py | 1 - oellm/task_groups.py | 3 +-- 10 files changed, 12 insertions(+), 16 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8a03dd5..6cf30a0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -51,4 +51,3 @@ jobs: run: | uvx yamllint . || true continue-on-error: true - diff --git a/.gitignore b/.gitignore index 9e29ad2..b897fcc 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,4 @@ **/*.egg-info **/*.csv **/uv.lock -**/task_map_cache.json \ No newline at end of file +**/task_map_cache.json diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 94ccd57..f47629e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.5.0 + rev: v6.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer @@ -18,4 +18,3 @@ repos: - id: ruff args: [--fix, --exit-non-zero-on-fix] - id: ruff-format - diff --git a/apptainer/build_sif_local.sh b/apptainer/build_sif_local.sh index 5b1fed5..6561919 100755 --- a/apptainer/build_sif_local.sh +++ b/apptainer/build_sif_local.sh @@ -28,4 +28,4 @@ for def in "${APPTAINER_DIR}"/*.def; do build_one "${def}" done -echo "\nAll SIF images built successfully. Find them under: ${OUTPUT_DIR}" \ No newline at end of file +echo "\nAll SIF images built successfully. 
Find them under: ${OUTPUT_DIR}" diff --git a/apptainer/jureca.def b/apptainer/jureca.def index 28a0391..7f088ad 100644 --- a/apptainer/jureca.def +++ b/apptainer/jureca.def @@ -26,7 +26,7 @@ import nltk nltk.download('punkt', download_dir='/opt/nltk_data') nltk.download('punkt_tab', download_dir='/opt/nltk_data') PY - + %environment export PATH=/usr/local/bin:$PATH export UV_TOOL_BIN_DIR=/usr/local/bin @@ -35,4 +35,4 @@ PY %runscript - exec bash "$@" \ No newline at end of file + exec bash "$@" diff --git a/apptainer/leonardo.def b/apptainer/leonardo.def index c9b2d74..f61f282 100644 --- a/apptainer/leonardo.def +++ b/apptainer/leonardo.def @@ -26,7 +26,7 @@ import nltk nltk.download('punkt', download_dir='/opt/nltk_data') nltk.download('punkt_tab', download_dir='/opt/nltk_data') PY - + %environment export PATH=/usr/local/bin:$PATH export UV_TOOL_BIN_DIR=/usr/local/bin @@ -34,4 +34,4 @@ PY export NLTK_DATA=/opt/nltk_data %runscript - exec bash "$@" \ No newline at end of file + exec bash "$@" diff --git a/apptainer/lumi.def b/apptainer/lumi.def index 815d3a1..c19f85f 100644 --- a/apptainer/lumi.def +++ b/apptainer/lumi.def @@ -26,7 +26,7 @@ import nltk nltk.download('punkt', download_dir='/opt/nltk_data') nltk.download('punkt_tab', download_dir='/opt/nltk_data') PY - + %environment export PATH=/usr/local/bin:$PATH export UV_TOOL_BIN_DIR=/usr/local/bin @@ -34,4 +34,4 @@ PY export NLTK_DATA=/opt/nltk_data %runscript - exec bash "$@" \ No newline at end of file + exec bash "$@" diff --git a/oellm/resources/task-groups.yaml b/oellm/resources/task-groups.yaml index 957581e..69ca6c8 100644 --- a/oellm/resources/task-groups.yaml +++ b/oellm/resources/task-groups.yaml @@ -143,7 +143,7 @@ task_groups: - task: mgsm_native_cot_de - task: mgsm_native_cot_es - task: mgsm_native_cot_fr - + generic-multilingual: description: "Generic multilingual benchmarks in Aya Expanse" suite: lm-eval-harness @@ -153,7 +153,7 @@ task_groups: - task: xcopa - task: xstorycloze - include: + include: description: "INCLUDE benchmarks in Aya Expanse" suite: lm-eval-harness n_shots: [0] diff --git a/oellm/task_cache.py b/oellm/task_cache.py index 7b58e52..a320bee 100644 --- a/oellm/task_cache.py +++ b/oellm/task_cache.py @@ -1,5 +1,4 @@ import json -import logging from contextlib import contextmanager from contextvars import ContextVar from datetime import datetime diff --git a/oellm/task_groups.py b/oellm/task_groups.py index df3f496..73c7d35 100644 --- a/oellm/task_groups.py +++ b/oellm/task_groups.py @@ -1,4 +1,3 @@ -from typing import TypedDict from collections.abc import Iterable from dataclasses import dataclass from importlib.resources import files @@ -128,7 +127,7 @@ def _expand_task_groups(group_names: Iterable[str]) -> list[TaskGroupResult]: suite = group.suite for t in group.tasks: shots = [int(s) for s in (t.n_shots or [])] - for shot in shots: + for shot in shots: results.append(TaskGroupResult(task=t.name, n_shot=shot, suite=suite)) else: for g in group.task_groups: From f552c96e4ff213d8b0f745587957ff6f3e62ce0d Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 21:01:44 +0200 Subject: [PATCH 36/39] misc --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6cf30a0..0d9fa48 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,7 +14,7 @@ jobs: - uses: actions/checkout@v4 - name: Install uv - uses: astral-sh/setup-uv@v3 + uses: astral-sh/setup-uv@v7 with: version: 
"latest" @@ -40,7 +40,7 @@ jobs: - uses: actions/checkout@v4 - name: Install uv - uses: astral-sh/setup-uv@v3 + uses: astral-sh/setup-uv@v7 with: version: "latest" From 9bbf5c10083da83c3ff22318aa3cfe93fc9b46bc Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Wed, 22 Oct 2025 23:21:42 +0200 Subject: [PATCH 37/39] fix: restrict model parallel --- oellm/resources/template.sbatch | 1 + 1 file changed, 1 insertion(+) diff --git a/oellm/resources/template.sbatch b/oellm/resources/template.sbatch index 16cf705..de1aa69 100644 --- a/oellm/resources/template.sbatch +++ b/oellm/resources/template.sbatch @@ -131,6 +131,7 @@ do singularity exec $SINGULARITY_ARGS \ --bind $BIND_PATHS \ + --env CUDA_VISIBLE_DEVICES=$SLURM_GPUS_PER_NODE \ $EVAL_SIF_PATH \ lighteval accelerate \ "model_name=$model_path,trust_remote_code=True" \ From 1b81460cd217acfc6e5cc3aee9cc20498fc4b0bd Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Thu, 23 Oct 2025 09:39:11 +0200 Subject: [PATCH 38/39] fix: result collection --- oellm/main.py | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/oellm/main.py b/oellm/main.py index 92bd230..c1ffac2 100644 --- a/oellm/main.py +++ b/oellm/main.py @@ -434,20 +434,107 @@ def collect_results( results = data.get("results", {}) n_shot_data = data.get("n-shot", {}) + # Infer a global n_shot if exactly one unique value exists in this JSON + global_n_shot = None + try: + candidate_values = [] + for _v in n_shot_data.values(): + if isinstance(_v, (int | float)): + candidate_values.append(int(_v)) + elif isinstance(_v, str) and _v.isdigit(): + candidate_values.append(int(_v)) + unique_values = set(candidate_values) + if len(unique_values) == 1: + global_n_shot = next(iter(unique_values)) + except Exception: + pass + + # Aggregate groups (lm-eval harness) + groups_map = data.get("groups", {}) + group_subtasks_map = data.get("group_subtasks", {}) + group_aggregate_names = set(groups_map.keys()) | set(group_subtasks_map.keys()) + group_subtask_names: set[str] = set() + for _agg, _subs in group_subtasks_map.items(): + for _s in _subs: + group_subtask_names.add(_s) + + # Prefer only the first aggregate metric from groups (simplified) + if groups_map: + group_name, group_results = next(iter(groups_map.items())) + n_shot = n_shot_data.get(group_name, "unknown") + if n_shot == "unknown": + for subtask_name in group_subtasks_map.get(group_name, []): + if subtask_name in n_shot_data: + n_shot = n_shot_data[subtask_name] + break + if n_shot == "unknown" and global_n_shot is not None: + n_shot = global_n_shot + performance = group_results.get("acc,none") + if performance is None: + for metric in ["acc", "accuracy", "f1", "exact_match"]: + if metric in group_results: + performance = group_results[metric] + break + if performance is not None: + if check: + completed_jobs.add((model_name, group_name, n_shot)) + rows.append( + { + "model_name": model_name, + "task": group_name, + "n_shot": n_shot, + "performance": performance, + } + ) + # Skip per-task iteration when groups are present + continue + for task_name, task_results in results.items(): + # Skip entries already added from groups + if groups_map and task_name in group_aggregate_names: + continue + # Skip any lm-eval group subtasks; keep only aggregates + if task_name in group_subtask_names: + continue + # Skip MMLU subtasks - only keep the aggregate score if task_name.startswith("mmlu_") and task_name != "mmlu": continue + # Skip Global MMLU subtasks - keep only aggregates like global_mmlu_full_pt 
+ if task_name.startswith("global_mmlu_") and task_name.count("_") >= 4: + continue + # Get n_shot for this task n_shot = n_shot_data.get(task_name, "unknown") + # If this is a group aggregate and n_shot is missing, derive from any subtask + if task_name in group_aggregate_names and n_shot == "unknown": + for subtask_name in group_subtasks_map.get(task_name, []): + if subtask_name in n_shot_data: + n_shot = n_shot_data[subtask_name] + break + if n_shot == "unknown" and global_n_shot is not None: + n_shot = global_n_shot + # Special handling for MMLU aggregate - get n_shot from any MMLU subtask if task_name == "mmlu" and n_shot == "unknown": for key, value in n_shot_data.items(): if key.startswith("mmlu_"): n_shot = value break + if n_shot == "unknown" and global_n_shot is not None: + n_shot = global_n_shot + + # Special handling for Global MMLU aggregates - get n_shot from subtasks + if task_name.startswith("global_mmlu_") and n_shot == "unknown": + prefix = f"{task_name}_" + for key, value in n_shot_data.items(): + if key.startswith(prefix): + n_shot = value + break + if n_shot == "unknown" and global_n_shot is not None: + n_shot = global_n_shot # Get the primary metric (usually acc,none) performance = task_results.get("acc,none") From c3e0b41787530a1999a2a3deaea4cdddcf414851 Mon Sep 17 00:00:00 2001 From: timurcarstensen Date: Thu, 23 Oct 2025 09:55:06 +0200 Subject: [PATCH 39/39] fix: leonardo directory --- oellm/resources/clusters.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/oellm/resources/clusters.yaml b/oellm/resources/clusters.yaml index 738c25a..0fa3f60 100644 --- a/oellm/resources/clusters.yaml +++ b/oellm/resources/clusters.yaml @@ -10,7 +10,7 @@ shared: leonardo: hostname_pattern: "*.leonardo.local" # use this regexp to automatically assign environment variables corresponding to this YAML - EVAL_BASE_DIR: "/leonardo_work/AIFAC_L01_028/shared_evals" + EVAL_BASE_DIR: "/leonardo_work/AIFAC_L01_028/oellm-cli-shared-evals" PARTITION: "boost_usr_prod" # default partition to use ACCOUNT: "AIFAC_L01_028" # default account to use QUEUE_LIMIT: 1000 # maximum number of jobs that can be submitted as job/array, used to send only jobs that respects QOS @@ -28,7 +28,7 @@ jureca: lumi: hostname_pattern: "uan*" - EVAL_BASE_DIR: "/pfs/lustrep4/scratch/project_462000963/shared_evals" + EVAL_BASE_DIR: "/pfs/lustrep4/scratch/project_462000963/oellm-cli-shared-evals" PARTITION: "small-g" ACCOUNT: "project_462000963" QUEUE_LIMIT: 210
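
With the task groups added in PATCH 32 and the reworked `schedule_evals` flow from PATCH 33, a typical end-to-end invocation is sketched below; the checkpoint directory is a placeholder, and the flag spellings assume the CLI mapping of the Python parameter names shown above.

```bash
# Illustrative only: schedule the multilingual super group against local
# checkpoints; the path is a placeholder and flags mirror schedule_evals().
oellm schedule-eval \
    --models /path/to/checkpoints \
    --task_groups oellm-multilingual
```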