From c994966e99ef66fee1513a790ff59f0f0dfdca8b Mon Sep 17 00:00:00 2001 From: Luca Lanzilao Date: Wed, 17 Jun 2026 13:54:33 +0200 Subject: [PATCH 1/2] make environment squash optional --- src/evalml/config.py | 5 ++++ workflow/rules/inference.smk | 43 ++++++++++++++++++++++++++----- workflow/tools/config.schema.json | 12 +++++++++ 3 files changed, 53 insertions(+), 7 deletions(-) diff --git a/src/evalml/config.py b/src/evalml/config.py index bbe94543..40d4fceb 100644 --- a/src/evalml/config.py +++ b/src/evalml/config.py @@ -101,6 +101,11 @@ class RunConfig(BaseModel): False, description="If true, the ECCODES_DEFINITION_PATH environment variable will not be set to the COSMO local definitions.", ) + skip_env_squashfs: bool = Field( + False, + description="If true, skip creating a squashfs image and activate the virtual environment directly. " + "Useful when squashfs creation is too slow and a quick iteration is preferred.", + ) config: Dict[str, Any] | str diff --git a/workflow/rules/inference.smk b/workflow/rules/inference.smk index 2d770556..5058c8c5 100644 --- a/workflow/rules/inference.smk +++ b/workflow/rules/inference.smk @@ -267,8 +267,11 @@ def _inference_routing_fn(wc): rule inference_execute: input: okfile=_inference_routing_fn, - image=lambda wc: OUT_ROOT - / f"data/runs/{RUN_CONFIGS[wc.run_id]['env_id']}/venv.squashfs", + env=lambda wc: ( + OUT_ROOT / f"data/runs/{RUN_CONFIGS[wc.run_id]['env_id']}/venv.squashfs" + if not RUN_CONFIGS[wc.run_id].get("skip_env_squashfs", False) + else OUT_ROOT / f"data/runs/{RUN_CONFIGS[wc.run_id]['env_id']}/.venv" + ), output: okfile=touch(OUT_ROOT / "logs/inference_execute/{run_id}-{init_time}.ok"), log: @@ -283,7 +286,8 @@ rule inference_execute: ntasks=lambda wc: get_resource(wc, "tasks", 1), gpus=lambda wc: get_resource(wc, "gpu", 1), params: - image_path=lambda wc, input: f"{Path(input.image).resolve()}", + env_path=lambda wc, input: f"{Path(input.env).resolve()}", + skip_env_squashfs=lambda wc: RUN_CONFIGS[wc.run_id].get("skip_env_squashfs", False), workdir=lambda wc: ( OUT_ROOT / f"data/runs/{wc.run_id}/{wc.init_time}" ).resolve(), @@ -298,11 +302,11 @@ rule inference_execute: cd {params.workdir} - squashfs-mount {params.image_path}:/user-environment -- bash -c ' - source /user-environment/bin/activate + if [ "{params.skip_env_squashfs}" = "True" ]; then + source {params.env_path}/bin/activate if [ "{params.disable_local_definitions}" = "False" ]; then - export ECCODES_DEFINITION_PATH=/user-environment/share/eccodes-cosmo-resources/definitions + export ECCODES_DEFINITION_PATH={params.env_path}/share/eccodes-cosmo-resources/definitions fi CMD_ARGS=() @@ -321,7 +325,32 @@ rule inference_execute: --gres={resources.gres} \ --ntasks={resources.ntasks} \ anemoi-inference run config.yaml "${{CMD_ARGS[@]}}" - ' + else + squashfs-mount {params.env_path}:/user-environment -- bash -c ' + source /user-environment/bin/activate + + if [ "{params.disable_local_definitions}" = "False" ]; then + export ECCODES_DEFINITION_PATH=/user-environment/share/eccodes-cosmo-resources/definitions + fi + + CMD_ARGS=() + + # is GPU > 1, add parallel flag to CMD_ARGS and override automatic cluster detection + if [ {resources.gpus} -gt 1 ]; then + CMD_ARGS+=(runner.parallel.cluster=slurm) + fi + + srun \ + --unbuffered \ + --partition={resources.slurm_partition} \ + --cpus-per-task={resources.cpus_per_task} \ + --mem-per-cpu={resources.mem_mb_per_cpu} \ + --time={resources.runtime} \ + --gres={resources.gres} \ + --ntasks={resources.ntasks} \ + anemoi-inference run config.yaml "${{CMD_ARGS[@]}}" + ' + fi ) >{log} 2>&1 """ # fmt: on diff --git a/workflow/tools/config.schema.json b/workflow/tools/config.schema.json index 03671aa1..4ea8abc6 100644 --- a/workflow/tools/config.schema.json +++ b/workflow/tools/config.schema.json @@ -381,6 +381,12 @@ "title": "Disable Local Eccodes Definitions", "type": "boolean" }, + "skip_env_squashfs": { + "default": false, + "description": "If true, skip creating a squashfs image and activate the virtual environment directly. Useful when squashfs creation is too slow and a quick iteration is preferred.", + "title": "Skip Env Squashfs", + "type": "boolean" + }, "config": { "anyOf": [ { @@ -733,6 +739,12 @@ "title": "Disable Local Eccodes Definitions", "type": "boolean" }, + "skip_env_squashfs": { + "default": false, + "description": "If true, skip creating a squashfs image and activate the virtual environment directly. Useful when squashfs creation is too slow and a quick iteration is preferred.", + "title": "Skip Env Squashfs", + "type": "boolean" + }, "config": { "anyOf": [ { From cc8bd5d0ea09c472b7c03cade2bbf73a42177235 Mon Sep 17 00:00:00 2001 From: Luca Lanzilao Date: Wed, 17 Jun 2026 17:06:42 +0200 Subject: [PATCH 2/2] Refactor and rename venv squashing flag --- src/evalml/config.py | 8 +++--- workflow/rules/inference.smk | 41 +++++++++---------------------- workflow/tools/config.schema.json | 16 ++++++------ 3 files changed, 24 insertions(+), 41 deletions(-) diff --git a/src/evalml/config.py b/src/evalml/config.py index 40d4fceb..63df081a 100644 --- a/src/evalml/config.py +++ b/src/evalml/config.py @@ -101,10 +101,10 @@ class RunConfig(BaseModel): False, description="If true, the ECCODES_DEFINITION_PATH environment variable will not be set to the COSMO local definitions.", ) - skip_env_squashfs: bool = Field( - False, - description="If true, skip creating a squashfs image and activate the virtual environment directly. " - "Useful when squashfs creation is too slow and a quick iteration is preferred.", + squash_venv: bool = Field( + True, + description="If true (default), package the virtual environment as a squashfs image before running inference. " + "Set to false to activate the virtual environment directly and skip the squashfs build step.", ) config: Dict[str, Any] | str diff --git a/workflow/rules/inference.smk b/workflow/rules/inference.smk index 5058c8c5..31af883f 100644 --- a/workflow/rules/inference.smk +++ b/workflow/rules/inference.smk @@ -269,7 +269,7 @@ rule inference_execute: okfile=_inference_routing_fn, env=lambda wc: ( OUT_ROOT / f"data/runs/{RUN_CONFIGS[wc.run_id]['env_id']}/venv.squashfs" - if not RUN_CONFIGS[wc.run_id].get("skip_env_squashfs", False) + if RUN_CONFIGS[wc.run_id].get("squash_venv", True) else OUT_ROOT / f"data/runs/{RUN_CONFIGS[wc.run_id]['env_id']}/.venv" ), output: @@ -287,7 +287,7 @@ rule inference_execute: gpus=lambda wc: get_resource(wc, "gpu", 1), params: env_path=lambda wc, input: f"{Path(input.env).resolve()}", - skip_env_squashfs=lambda wc: RUN_CONFIGS[wc.run_id].get("skip_env_squashfs", False), + squash_venv=lambda wc: RUN_CONFIGS[wc.run_id].get("squash_venv", True), workdir=lambda wc: ( OUT_ROOT / f"data/runs/{wc.run_id}/{wc.init_time}" ).resolve(), @@ -302,11 +302,12 @@ rule inference_execute: cd {params.workdir} - if [ "{params.skip_env_squashfs}" = "True" ]; then - source {params.env_path}/bin/activate + _run_inference() {{ + local VENV=$1 + source "$VENV/bin/activate" if [ "{params.disable_local_definitions}" = "False" ]; then - export ECCODES_DEFINITION_PATH={params.env_path}/share/eccodes-cosmo-resources/definitions + export ECCODES_DEFINITION_PATH="$VENV/share/eccodes-cosmo-resources/definitions" fi CMD_ARGS=() @@ -325,31 +326,13 @@ rule inference_execute: --gres={resources.gres} \ --ntasks={resources.ntasks} \ anemoi-inference run config.yaml "${{CMD_ARGS[@]}}" + }} + export -f _run_inference + + if [ "{params.squash_venv}" = "True" ]; then + squashfs-mount {params.env_path}:/user-environment -- bash -c '_run_inference /user-environment' else - squashfs-mount {params.env_path}:/user-environment -- bash -c ' - source /user-environment/bin/activate - - if [ "{params.disable_local_definitions}" = "False" ]; then - export ECCODES_DEFINITION_PATH=/user-environment/share/eccodes-cosmo-resources/definitions - fi - - CMD_ARGS=() - - # is GPU > 1, add parallel flag to CMD_ARGS and override automatic cluster detection - if [ {resources.gpus} -gt 1 ]; then - CMD_ARGS+=(runner.parallel.cluster=slurm) - fi - - srun \ - --unbuffered \ - --partition={resources.slurm_partition} \ - --cpus-per-task={resources.cpus_per_task} \ - --mem-per-cpu={resources.mem_mb_per_cpu} \ - --time={resources.runtime} \ - --gres={resources.gres} \ - --ntasks={resources.ntasks} \ - anemoi-inference run config.yaml "${{CMD_ARGS[@]}}" - ' + _run_inference "{params.env_path}" fi ) >{log} 2>&1 """ diff --git a/workflow/tools/config.schema.json b/workflow/tools/config.schema.json index 4ea8abc6..f0d7b9cb 100644 --- a/workflow/tools/config.schema.json +++ b/workflow/tools/config.schema.json @@ -381,10 +381,10 @@ "title": "Disable Local Eccodes Definitions", "type": "boolean" }, - "skip_env_squashfs": { - "default": false, - "description": "If true, skip creating a squashfs image and activate the virtual environment directly. Useful when squashfs creation is too slow and a quick iteration is preferred.", - "title": "Skip Env Squashfs", + "squash_venv": { + "default": true, + "description": "If true (default), package the virtual environment as a squashfs image before running inference. Set to false to activate the virtual environment directly and skip the squashfs build step.", + "title": "Squash Venv", "type": "boolean" }, "config": { @@ -739,10 +739,10 @@ "title": "Disable Local Eccodes Definitions", "type": "boolean" }, - "skip_env_squashfs": { - "default": false, - "description": "If true, skip creating a squashfs image and activate the virtual environment directly. Useful when squashfs creation is too slow and a quick iteration is preferred.", - "title": "Skip Env Squashfs", + "squash_venv": { + "default": true, + "description": "If true (default), package the virtual environment as a squashfs image before running inference. Set to false to activate the virtual environment directly and skip the squashfs build step.", + "title": "Squash Venv", "type": "boolean" }, "config": {