From b54cce66b9e4c17963e6db56a923cd231679eb91 Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Thu, 2 Apr 2026 16:46:21 -0700 Subject: [PATCH 01/35] Added new feature to execute workflows without a kubernetes cluster, for faster iteration when developing workflows --- src/cli/local.py | 72 ++ src/cli/main_parser.py | 4 +- src/utils/BUILD | 10 + src/utils/local_executor.py | 299 +++++ src/utils/tests/BUILD | 10 + src/utils/tests/test_local_executor.py | 1406 ++++++++++++++++++++++++ 6 files changed, 1800 insertions(+), 1 deletion(-) create mode 100644 src/cli/local.py create mode 100644 src/utils/local_executor.py create mode 100644 src/utils/tests/test_local_executor.py diff --git a/src/cli/local.py b/src/cli/local.py new file mode 100644 index 000000000..a9f12eaaf --- /dev/null +++ b/src/cli/local.py @@ -0,0 +1,72 @@ +""" +SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
def setup_parser(parser: argparse._SubParsersAction):
    """Register the ``local`` command and its ``run`` sub-command.

    Args:
        parser: The sub-parsers action of the top-level CLI parser.
    """
    local_parser = parser.add_parser(
        'local',
        help='Run workflows locally using Docker (no Kubernetes cluster required).')
    subparsers = local_parser.add_subparsers(dest='command')
    subparsers.required = True

    run_parser = subparsers.add_parser(
        'run',
        help='Execute a workflow spec locally using Docker containers.')
    # `.complete` is set on the returned action so shtab offers file-name completion.
    run_parser.add_argument(
        '-f', '--file',
        required=True,
        dest='workflow_file',
        help='Path to the workflow YAML spec file.').complete = shtab.FILE
    run_parser.add_argument(
        '--work-dir',
        dest='work_dir',
        default=None,
        help='Directory for task inputs/outputs. Defaults to a temporary directory.')
    run_parser.add_argument(
        '--keep',
        action='store_true',
        default=False,
        help='Keep the work directory after execution (always kept on failure).')
    run_parser.add_argument(
        '--docker',
        dest='docker_cmd',
        default='docker',
        help='Docker-compatible command to use (e.g. podman). Default: docker.')
    run_parser.set_defaults(func=_run_local)


def _run_local(service_client, args: argparse.Namespace):
    """Handle ``local run``: execute the workflow and exit non-zero on failure.

    Args:
        service_client: Unused; accepted so the handler matches the signature
            the CLI dispatcher passes to every ``func``.
        args: Parsed arguments (``workflow_file``, ``work_dir``, ``keep``,
            ``docker_cmd``).

    Exits with status 1 when the spec is rejected, cannot be read, or any task fails.
    """
    # Function-scope import: only this handler needs it.
    import inspect

    run_kwargs = {
        'spec_path': args.workflow_file,
        'work_dir': args.work_dir,
        'keep_work_dir': args.keep,
    }

    # --docker used to be parsed and then silently dropped. Forward it when the
    # executor entry point accepts it; otherwise tell the user instead of
    # quietly running a different command than the one requested.
    entry_point_params = inspect.signature(local_executor.run_workflow_locally).parameters
    if 'docker_cmd' in entry_point_params:
        run_kwargs['docker_cmd'] = args.docker_cmd
    elif args.docker_cmd != 'docker':
        print(f'Warning: --docker {args.docker_cmd} is not supported by this executor; '
              'falling back to docker.', file=sys.stderr)

    try:
        success = local_executor.run_workflow_locally(**run_kwargs)
    except (OSError, ValueError) as error:
        # OSError covers an unreadable/missing spec file; ValueError covers
        # spec rejection by the executor.
        print(f'Error: {error}', file=sys.stderr)
        sys.exit(1)

    if not success:
        sys.exit(1)
logger = logging.getLogger(__name__)


@dataclasses.dataclass
class TaskNode:
    """A single task in the workflow DAG together with its dependency edges."""
    name: str
    spec: task_module.TaskSpec
    group: str
    # Names of the tasks this task lists as `task:` inputs.
    upstream: Set[str] = dataclasses.field(default_factory=set)
    # Names of the tasks that list this task as a `task:` input.
    downstream: Set[str] = dataclasses.field(default_factory=set)


@dataclasses.dataclass
class TaskResult:
    """Outcome recorded for a task once it has run or been cancelled."""
    name: str
    # Container exit code; -1 marks a task cancelled because an upstream task
    # failed, 127 means the container command itself could not be found.
    exit_code: int
    output_dir: str


class LocalExecutor:
    """
    Executes an OSMO workflow spec locally using Docker, without Kubernetes.

    Supports:
      - Serial and parallel task DAGs (groups flattened to individual tasks)
      - {{output}} and {{input:N}} / {{input:taskname}} token substitution
      - Inline `files:` written to the container
      - `environment:` passed as Docker env vars
      - Task-to-task data flow via shared local directories

    Does NOT support (raises clear errors):
      - Dataset / URL inputs/outputs (require object storage)
      - Credentials, checkpoints, volumeMounts (require cluster infra)
      - Templated specs with Jinja (require server-side expansion; use --dry-run first)
    """

    # Matches `{{ name }}` with optional whitespace inside the braces.
    _TOKEN_PATTERN = re.compile(r'\{\{\s*([^{}]+?)\s*\}\}')

    def __init__(self, work_dir: str, keep_work_dir: bool = False, docker_cmd: str = 'docker'):
        """
        Args:
            work_dir: Directory that receives one sub-directory per task.
            keep_work_dir: Stored for callers; the executor itself never deletes work_dir.
            docker_cmd: Docker-compatible executable used to run containers.
        """
        # Bind mounts (`-v host:container`) need an absolute host path, so a
        # relative --work-dir is normalised once here.
        self._work_dir = os.path.abspath(work_dir)
        self._keep_work_dir = keep_work_dir
        self._docker_cmd = docker_cmd
        self._task_nodes: Dict[str, TaskNode] = {}
        self._results: Dict[str, TaskResult] = {}

    def load_spec(self, spec_text: str) -> workflow_module.WorkflowSpec:
        """Parse YAML text into a workflow spec.

        Raises:
            ValueError: If the document is empty or not a mapping. Errors from
                the spec model itself propagate unchanged.
        """
        raw = yaml.safe_load(spec_text)
        if not isinstance(raw, dict):
            # An empty file parses to None; `**None` would be a confusing TypeError.
            raise ValueError('Workflow spec must be a YAML mapping with a "workflow" section.')
        versioned = workflow_module.VersionedWorkflowSpec(**raw)
        return versioned.workflow

    def execute(self, spec: workflow_module.WorkflowSpec) -> bool:
        """Run every task of *spec* in dependency order.

        Execution is fail-fast: the first task that exits non-zero cancels its
        downstream tasks and ends the run.

        Returns:
            True when every task ran and exited 0, False otherwise.

        Raises:
            ValueError: If the spec references an unknown task or uses a
                feature that local execution does not support.
        """
        # Results from a previous execute() on this instance must not leak in.
        self._results.clear()
        self._build_dag(spec)
        self._validate_for_local(spec)
        self._setup_directories()

        groups = self._groups(spec)
        logger.info('Workflow "%s": %d task(s) across %d group(s)',
                    spec.name, sum(len(g.tasks) for g in groups), len(groups))

        ready = self._find_ready_tasks()
        while ready:
            for task_name in ready:
                node = self._task_nodes[task_name]
                logger.info('--- Running task: %s (image: %s) ---', task_name, node.spec.image)
                result = self._run_task(node, spec)
                self._results[task_name] = result

                if result.exit_code != 0:
                    logger.error('Task "%s" failed with exit code %d',
                                 task_name, result.exit_code)
                    self._cancel_downstream(task_name)
                    return False

                logger.info('Task "%s" completed successfully', task_name)

            ready = self._find_ready_tasks()

        # Tasks that never became ready (e.g. a dependency cycle) would
        # otherwise be skipped silently and the run reported as a success.
        unscheduled = [name for name in self._task_nodes if name not in self._results]
        if unscheduled:
            logger.error('Workflow "%s" could not schedule task(s) %s; '
                         'check for circular task dependencies',
                         spec.name, ', '.join(unscheduled))
            return False

        logger.info('Workflow "%s" completed successfully', spec.name)
        return True

    def _groups(self, spec: workflow_module.WorkflowSpec) -> List[task_module.TaskGroupSpec]:
        """Return the spec's groups, wrapping each top-level task in its own group."""
        if spec.groups:
            return spec.groups
        return [task_module.TaskGroupSpec(name=t.name, tasks=[t]) for t in spec.tasks]

    def _build_dag(self, spec: workflow_module.WorkflowSpec):
        """Populate `_task_nodes` and link upstream/downstream edges from `task:` inputs.

        Raises:
            ValueError: If a task depends on a task name that is not in the spec.
        """
        self._task_nodes.clear()
        groups = self._groups(spec)

        # First pass: register every task so forward references resolve.
        for group in groups:
            for task_spec in group.tasks:
                self._task_nodes[task_spec.name] = TaskNode(
                    name=task_spec.name,
                    spec=task_spec,
                    group=group.name,
                )

        # Second pass: wire the edges.
        for group in groups:
            for task_spec in group.tasks:
                for input_source in task_spec.inputs:
                    if not isinstance(input_source, task_module.TaskInputOutput):
                        continue
                    upstream_task = input_source.task
                    if upstream_task not in self._task_nodes:
                        raise ValueError(
                            f'Task "{task_spec.name}" depends on unknown task "{upstream_task}"')
                    self._task_nodes[task_spec.name].upstream.add(upstream_task)
                    self._task_nodes[upstream_task].downstream.add(task_spec.name)

    def _validate_for_local(self, spec: workflow_module.WorkflowSpec):
        """Reject specs that use features unavailable without a cluster.

        All problems are collected and reported together.

        Raises:
            ValueError: Listing every unsupported feature found.
        """
        unsupported_features = []
        for group in self._groups(spec):
            for task_spec in group.tasks:
                for input_source in task_spec.inputs:
                    if isinstance(input_source, task_module.DatasetInputOutput):
                        unsupported_features.append(
                            f'Task "{task_spec.name}": dataset inputs require object storage')
                    elif isinstance(input_source, task_module.URLInputOutput):
                        unsupported_features.append(
                            f'Task "{task_spec.name}": URL inputs require network/storage access')

                for output in task_spec.outputs:
                    if isinstance(output, (task_module.DatasetInputOutput,
                                           task_module.URLInputOutput)):
                        unsupported_features.append(
                            f'Task "{task_spec.name}": dataset/URL outputs require object storage')

                if task_spec.credentials:
                    unsupported_features.append(
                        f'Task "{task_spec.name}": credentials require the OSMO secret manager')

                if task_spec.checkpoint:
                    unsupported_features.append(
                        f'Task "{task_spec.name}": checkpoints require object storage')

                if task_spec.volumeMounts:
                    unsupported_features.append(
                        f'Task "{task_spec.name}": volumeMounts require cluster-level host paths')

        if unsupported_features:
            raise ValueError(
                'The following features are not supported in local execution mode:\n  - '
                + '\n  - '.join(unsupported_features))

    def _setup_directories(self):
        """Create the work directory and an `output` directory for every task."""
        os.makedirs(self._work_dir, exist_ok=True)
        for task_name in self._task_nodes:
            os.makedirs(os.path.join(self._work_dir, task_name, 'output'), exist_ok=True)

    def _find_ready_tasks(self) -> List[str]:
        """Return tasks with no result yet whose upstream tasks all exited 0."""
        ready = []
        for name, node in self._task_nodes.items():
            if name in self._results:
                continue
            if all(up in self._results and self._results[up].exit_code == 0
                   for up in node.upstream):
                ready.append(name)
        return ready

    def _cancel_downstream(self, failed_task: str):
        """Record exit code -1 for every task reachable downstream of *failed_task*."""
        # Writing the result doubles as the "visited" marker.
        pending = [failed_task]
        while pending:
            current = pending.pop()
            for child in self._task_nodes[current].downstream:
                if child not in self._results:
                    self._results[child] = TaskResult(
                        name=child, exit_code=-1, output_dir='')
                    pending.append(child)

    def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskResult:
        """Run one task in a container and return its result.

        Output and upstream-input directories are bind-mounted at the same
        path inside the container as on the host, so substituted tokens are
        valid on both sides. Inline files are written under the task's `files`
        directory and mounted read-only at their declared path.

        Args:
            node: The task to run.
            spec: The enclosing workflow spec (currently unused).

        Returns:
            The task result; exit code 127 when the container command is missing.
        """
        task_spec = node.spec
        task_dir = os.path.join(self._work_dir, node.name)
        output_dir = os.path.join(task_dir, 'output')
        files_dir = os.path.join(task_dir, 'files')
        os.makedirs(files_dir, exist_ok=True)

        token_map = self._build_token_map(node, output_dir)

        file_mounts = []
        for file_spec in task_spec.files:
            host_path = os.path.join(files_dir, file_spec.path.lstrip('/'))
            os.makedirs(os.path.dirname(host_path), exist_ok=True)
            with open(host_path, 'w', encoding='utf-8') as handle:
                handle.write(self._substitute_tokens(file_spec.contents, token_map))
            file_mounts += ['-v', f'{host_path}:{file_spec.path}:ro']

        resolved_command = [self._substitute_tokens(c, token_map) for c in task_spec.command]
        resolved_args = [self._substitute_tokens(a, token_map) for a in task_spec.args]

        docker_args = [self._docker_cmd, 'run', '--rm']

        for key, value in task_spec.environment.items():
            resolved_value = self._substitute_tokens(value, token_map)
            docker_args += ['-e', f'{key}={resolved_value}']

        docker_args += ['-v', f'{output_dir}:{output_dir}']

        # The same upstream listed twice would produce a duplicate mount
        # point, so each directory is mounted once.
        mounted_inputs: Set[str] = set()
        for input_source in task_spec.inputs:
            if not isinstance(input_source, task_module.TaskInputOutput):
                continue
            upstream_dir = self._results[input_source.task].output_dir
            if upstream_dir in mounted_inputs:
                continue
            mounted_inputs.add(upstream_dir)
            docker_args += ['-v', f'{upstream_dir}:{upstream_dir}:ro']

        docker_args += file_mounts
        docker_args.append(task_spec.image)
        docker_args += resolved_command + resolved_args

        logger.debug('Docker command: %s', ' '.join(docker_args))

        try:
            # Container output goes straight to the caller's stdout/stderr.
            process = subprocess.run(docker_args, check=False)
        except FileNotFoundError:
            logger.error('"%s" not found. Is it installed and in your PATH?', self._docker_cmd)
            return TaskResult(name=node.name, exit_code=127, output_dir=output_dir)
        return TaskResult(name=node.name, exit_code=process.returncode, output_dir=output_dir)

    def _build_token_map(self, node: TaskNode, output_dir: str) -> Dict[str, str]:
        """Map token names (`output`, `input:N`, `input:<task>`) to host directories."""
        tokens: Dict[str, str] = {
            'output': output_dir,
        }
        for index, input_source in enumerate(node.spec.inputs):
            if isinstance(input_source, task_module.TaskInputOutput):
                upstream_dir = self._results[input_source.task].output_dir
                tokens[f'input:{input_source.task}'] = upstream_dir
                tokens[f'input:{index}'] = upstream_dir
        return tokens

    def _substitute_tokens(self, text: str, tokens: Dict[str, str]) -> str:
        """Replace every `{{ name }}` in *text* whose name is a key of *tokens*.

        Unknown tokens are left untouched. Substitution is a single pass, so
        replacement values are never re-scanned.
        """
        def _replace(match):
            # A callable replacement is inserted literally, so backslashes in
            # a path are not parsed as regex escapes.
            return tokens.get(match.group(1), match.group(0))

        return self._TOKEN_PATTERN.sub(_replace, text)


def run_workflow_locally(spec_path: str, work_dir: str | None = None,
                         keep_work_dir: bool = False,
                         docker_cmd: str = 'docker') -> bool:
    """Load a workflow spec from disk and execute it with `LocalExecutor`.

    Args:
        spec_path: Path to the workflow YAML file.
        work_dir: Directory for task data; a temporary directory is created when None.
        keep_work_dir: Keep work_dir after a successful run. It is always kept
            after a failed run.
        docker_cmd: Docker-compatible executable used to run containers.

    Returns:
        True when the workflow succeeded, False when a task failed.

    Raises:
        ValueError: If the spec looks like a Jinja template or is rejected by the executor.
        OSError: If spec_path cannot be read.
    """
    # Read and screen the spec before creating anything on disk, so a rejected
    # spec leaves no temporary directory behind.
    with open(spec_path, encoding='utf-8') as spec_file:
        spec_text = spec_file.read()

    # Jinja statement blocks open with `{%` and comments with `{#`.
    template_markers = ('{%', '{#', 'default-values')
    if any(marker in spec_text for marker in template_markers):
        raise ValueError(
            'This spec uses Jinja templates which require server-side expansion.\n'
            f'Run "osmo workflow submit --dry-run -f {spec_path}" first '
            'to get the expanded spec,\n'
            'then save that output and run it locally.')

    created_temp_dir = work_dir is None
    if created_temp_dir:
        work_dir = tempfile.mkdtemp(prefix='osmo-local-')
        logger.info('Using temporary work directory: %s', work_dir)

    executor = LocalExecutor(work_dir=work_dir, keep_work_dir=keep_work_dir,
                             docker_cmd=docker_cmd)
    try:
        spec = executor.load_spec(spec_text)
        success = executor.execute(spec)
    except Exception:
        if created_temp_dir:
            # rmdir only removes an empty directory: an untouched temp dir is
            # cleaned up, while anything already written stays for debugging.
            try:
                os.rmdir(work_dir)
            except OSError:
                pass
        raise

    if not keep_work_dir and success:
        logger.info('Cleaning up work directory: %s', work_dir)
        shutil.rmtree(work_dir, ignore_errors=True)
    elif not success:
        logger.info('Work directory preserved for debugging: %s', work_dir)

    return success
+ +SPDX-License-Identifier: Apache-2.0 +""" + +import os +import shutil +import subprocess +import tempfile +import textwrap +import unittest +from unittest import mock + +from src.utils.job import task as task_module +from src.utils.local_executor import LocalExecutor, TaskNode, TaskResult, run_workflow_locally + + +# --------------------------------------------------------------------------- +# Helper: detect Docker availability once for the entire module +# --------------------------------------------------------------------------- +def _docker_available() -> bool: + try: + result = subprocess.run( + ['docker', 'info'], + capture_output=True, + timeout=10, + ) + return result.returncode == 0 + except (FileNotFoundError, subprocess.TimeoutExpired): + return False + + +DOCKER_AVAILABLE = _docker_available() +SKIP_DOCKER_MSG = 'Docker is not available on this machine' + + +# ============================================================================ +# Unit tests — no Docker required; exercise parsing, DAG, tokens, validation +# ============================================================================ +class TestLoadSpec(unittest.TestCase): + """Verify that real OSMO YAML specs are parsed correctly via the existing Pydantic models.""" + + def test_single_task_spec(self): + spec_text = textwrap.dedent('''\ + workflow: + name: hello-osmo + tasks: + - name: hello + image: ubuntu:24.04 + command: ["echo"] + args: ["Hello from OSMO!"] + ''') + executor = LocalExecutor(work_dir='/tmp/unused') + spec = executor.load_spec(spec_text) + self.assertEqual(spec.name, 'hello-osmo') + self.assertEqual(len(spec.tasks), 1) + self.assertEqual(spec.tasks[0].name, 'hello') + self.assertEqual(spec.tasks[0].image, 'ubuntu:24.04') + + def test_serial_tasks_spec(self): + spec_text = textwrap.dedent('''\ + workflow: + name: serial-tasks + tasks: + - name: task1 + image: ubuntu:22.04 + command: [sh] + args: [/tmp/run.sh] + files: + - contents: | + echo "Hello from task1" + echo "data" > 
{{output}}/test.txt + path: /tmp/run.sh + - name: task2 + image: ubuntu:22.04 + command: [sh] + args: [/tmp/run.sh] + files: + - contents: | + cat {{input:0}}/test.txt + path: /tmp/run.sh + inputs: + - task: task1 + ''') + executor = LocalExecutor(work_dir='/tmp/unused') + spec = executor.load_spec(spec_text) + self.assertEqual(spec.name, 'serial-tasks') + self.assertEqual(len(spec.tasks), 2) + first_input = spec.tasks[1].inputs[0] + self.assertIsInstance(first_input, task_module.TaskInputOutput) + if isinstance(first_input, task_module.TaskInputOutput): + self.assertEqual(first_input.task, 'task1') + + def test_groups_spec(self): + spec_text = textwrap.dedent('''\ + workflow: + name: grouped + groups: + - name: first-group + tasks: + - name: leader + lead: true + image: ubuntu:24.04 + command: ["echo", "leader"] + - name: follower + image: ubuntu:24.04 + command: ["echo", "follower"] + ''') + executor = LocalExecutor(work_dir='/tmp/unused') + spec = executor.load_spec(spec_text) + self.assertEqual(len(spec.groups), 1) + self.assertEqual(len(spec.groups[0].tasks), 2) + self.assertTrue(spec.groups[0].tasks[0].lead) + + def test_versioned_spec(self): + spec_text = textwrap.dedent('''\ + version: 2 + workflow: + name: versioned + tasks: + - name: task + image: alpine:3.18 + command: ["echo", "ok"] + ''') + executor = LocalExecutor(work_dir='/tmp/unused') + spec = executor.load_spec(spec_text) + self.assertEqual(spec.name, 'versioned') + + def test_invalid_version_rejected(self): + spec_text = textwrap.dedent('''\ + version: 99 + workflow: + name: bad-version + tasks: + - name: task + image: alpine:3.18 + command: ["echo", "ok"] + ''') + executor = LocalExecutor(work_dir='/tmp/unused') + with self.assertRaises(Exception): + executor.load_spec(spec_text) + + def test_both_tasks_and_groups_rejected(self): + spec_text = textwrap.dedent('''\ + workflow: + name: invalid + tasks: + - name: t + image: alpine:3.18 + command: ["echo"] + groups: + - name: g + tasks: + - name: t2 
+ image: alpine:3.18 + command: ["echo"] + ''') + executor = LocalExecutor(work_dir='/tmp/unused') + with self.assertRaises(Exception): + executor.load_spec(spec_text) + + def test_empty_workflow_rejected(self): + spec_text = textwrap.dedent('''\ + workflow: + name: empty + ''') + executor = LocalExecutor(work_dir='/tmp/unused') + with self.assertRaises(Exception): + executor.load_spec(spec_text) + + def test_resources_spec_parsed(self): + spec_text = textwrap.dedent('''\ + workflow: + name: with-resources + resources: + default: + cpu: 2 + memory: 4Gi + storage: 10Gi + tasks: + - name: task + image: ubuntu:24.04 + command: ["echo", "ok"] + ''') + executor = LocalExecutor(work_dir='/tmp/unused') + spec = executor.load_spec(spec_text) + self.assertEqual(spec.resources['default'].cpu, 2) + self.assertEqual(spec.resources['default'].memory, '4Gi') + + def test_environment_parsed(self): + spec_text = textwrap.dedent('''\ + workflow: + name: env-test + tasks: + - name: task + image: alpine:3.18 + command: ["printenv"] + environment: + MY_VAR: hello + ANOTHER: world + ''') + executor = LocalExecutor(work_dir='/tmp/unused') + spec = executor.load_spec(spec_text) + self.assertEqual(spec.tasks[0].environment['MY_VAR'], 'hello') + self.assertEqual(spec.tasks[0].environment['ANOTHER'], 'world') + + +class TestBuildDag(unittest.TestCase): + """Verify DAG construction from task dependencies.""" + + def _make_executor(self) -> LocalExecutor: + return LocalExecutor(work_dir='/tmp/unused') + + def test_no_dependencies(self): + spec_text = textwrap.dedent('''\ + workflow: + name: parallel + tasks: + - name: a + image: alpine:3.18 + command: ["echo", "a"] + - name: b + image: alpine:3.18 + command: ["echo", "b"] + - name: c + image: alpine:3.18 + command: ["echo", "c"] + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + + self.assertEqual(len(executor._task_nodes), 3) + for node in executor._task_nodes.values(): + 
self.assertEqual(len(node.upstream), 0) + self.assertEqual(len(node.downstream), 0) + + def test_serial_chain(self): + spec_text = textwrap.dedent('''\ + workflow: + name: serial + tasks: + - name: first + image: alpine:3.18 + command: ["echo"] + - name: second + image: alpine:3.18 + command: ["echo"] + inputs: + - task: first + - name: third + image: alpine:3.18 + command: ["echo"] + inputs: + - task: second + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + + self.assertEqual(executor._task_nodes['first'].upstream, set()) + self.assertEqual(executor._task_nodes['first'].downstream, {'second'}) + self.assertEqual(executor._task_nodes['second'].upstream, {'first'}) + self.assertEqual(executor._task_nodes['second'].downstream, {'third'}) + self.assertEqual(executor._task_nodes['third'].upstream, {'second'}) + self.assertEqual(executor._task_nodes['third'].downstream, set()) + + def test_diamond_dependency(self): + spec_text = textwrap.dedent('''\ + workflow: + name: diamond + tasks: + - name: root + image: alpine:3.18 + command: ["echo"] + - name: left + image: alpine:3.18 + command: ["echo"] + inputs: + - task: root + - name: right + image: alpine:3.18 + command: ["echo"] + inputs: + - task: root + - name: join + image: alpine:3.18 + command: ["echo"] + inputs: + - task: left + - task: right + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + + self.assertEqual(executor._task_nodes['root'].downstream, {'left', 'right'}) + self.assertEqual(executor._task_nodes['join'].upstream, {'left', 'right'}) + + def test_unknown_dependency_raises(self): + spec_text = textwrap.dedent('''\ + workflow: + name: broken + tasks: + - name: task1 + image: alpine:3.18 + command: ["echo"] + inputs: + - task: nonexistent + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + with self.assertRaises(ValueError) as context: + 
executor._build_dag(spec) + self.assertIn('nonexistent', str(context.exception)) + + def test_groups_with_cross_group_deps(self): + spec_text = textwrap.dedent('''\ + workflow: + name: cross-group + groups: + - name: fetch + tasks: + - name: download + lead: true + image: alpine:3.18 + command: ["echo"] + - name: process + tasks: + - name: transform + lead: true + image: alpine:3.18 + command: ["echo"] + inputs: + - task: download + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + + self.assertEqual(executor._task_nodes['download'].downstream, {'transform'}) + self.assertEqual(executor._task_nodes['transform'].upstream, {'download'}) + + +class TestFindReadyTasks(unittest.TestCase): + """Verify correct identification of tasks ready to execute.""" + + def test_all_root_tasks_ready(self): + spec_text = textwrap.dedent('''\ + workflow: + name: parallel + tasks: + - name: a + image: alpine:3.18 + command: ["echo"] + - name: b + image: alpine:3.18 + command: ["echo"] + ''') + executor = LocalExecutor(work_dir='/tmp/unused') + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + + ready = executor._find_ready_tasks() + self.assertEqual(set(ready), {'a', 'b'}) + + def test_dependent_not_ready_until_upstream_completes(self): + spec_text = textwrap.dedent('''\ + workflow: + name: serial + tasks: + - name: first + image: alpine:3.18 + command: ["echo"] + - name: second + image: alpine:3.18 + command: ["echo"] + inputs: + - task: first + ''') + executor = LocalExecutor(work_dir='/tmp/unused') + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + + ready = executor._find_ready_tasks() + self.assertEqual(ready, ['first']) + + executor._results['first'] = TaskResult(name='first', exit_code=0, output_dir='/tmp/out') + ready = executor._find_ready_tasks() + self.assertEqual(ready, ['second']) + + def test_failed_upstream_blocks_downstream(self): + spec_text = textwrap.dedent('''\ + workflow: + 
name: serial + tasks: + - name: first + image: alpine:3.18 + command: ["echo"] + - name: second + image: alpine:3.18 + command: ["echo"] + inputs: + - task: first + ''') + executor = LocalExecutor(work_dir='/tmp/unused') + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + + executor._results['first'] = TaskResult(name='first', exit_code=1, output_dir='/tmp/out') + ready = executor._find_ready_tasks() + self.assertEqual(ready, []) + + +class TestCancelDownstream(unittest.TestCase): + + def test_cascading_cancel(self): + spec_text = textwrap.dedent('''\ + workflow: + name: chain + tasks: + - name: a + image: alpine:3.18 + command: ["echo"] + - name: b + image: alpine:3.18 + command: ["echo"] + inputs: + - task: a + - name: c + image: alpine:3.18 + command: ["echo"] + inputs: + - task: b + ''') + executor = LocalExecutor(work_dir='/tmp/unused') + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + + executor._results['a'] = TaskResult(name='a', exit_code=1, output_dir='/tmp') + executor._cancel_downstream('a') + + self.assertIn('b', executor._results) + self.assertIn('c', executor._results) + self.assertEqual(executor._results['b'].exit_code, -1) + self.assertEqual(executor._results['c'].exit_code, -1) + + +class TestSubstituteTokens(unittest.TestCase): + + def test_output_token(self): + executor = LocalExecutor(work_dir='/tmp/unused') + tokens = {'output': '/work/task1/output'} + result = executor._substitute_tokens('echo data > {{output}}/file.txt', tokens) + self.assertEqual(result, 'echo data > /work/task1/output/file.txt') + + def test_input_by_index(self): + executor = LocalExecutor(work_dir='/tmp/unused') + tokens = {'input:0': '/work/upstream/output'} + result = executor._substitute_tokens('cat {{input:0}}/data.csv', tokens) + self.assertEqual(result, 'cat /work/upstream/output/data.csv') + + def test_input_by_name(self): + executor = LocalExecutor(work_dir='/tmp/unused') + tokens = {'input:task1': '/work/task1/output'} + result 
= executor._substitute_tokens('cat {{ input:task1 }}/data.csv', tokens) + self.assertEqual(result, 'cat /work/task1/output/data.csv') + + def test_whitespace_around_tokens(self): + executor = LocalExecutor(work_dir='/tmp/unused') + tokens = {'output': '/out'} + result = executor._substitute_tokens('{{ output }}/file.txt', tokens) + self.assertEqual(result, '/out/file.txt') + + def test_multiple_tokens_in_one_string(self): + executor = LocalExecutor(work_dir='/tmp/unused') + tokens = {'output': '/out', 'input:0': '/in0'} + result = executor._substitute_tokens('cp {{input:0}}/src {{output}}/dst', tokens) + self.assertEqual(result, 'cp /in0/src /out/dst') + + def test_no_tokens_unchanged(self): + executor = LocalExecutor(work_dir='/tmp/unused') + result = executor._substitute_tokens('plain text no tokens', {}) + self.assertEqual(result, 'plain text no tokens') + + +class TestBuildTokenMap(unittest.TestCase): + + def test_output_only(self): + spec_text = textwrap.dedent('''\ + workflow: + name: simple + tasks: + - name: task1 + image: alpine:3.18 + command: ["echo"] + ''') + executor = LocalExecutor(work_dir='/tmp/work') + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + + node = executor._task_nodes['task1'] + tokens = executor._build_token_map(node, '/tmp/work/task1/output') + self.assertEqual(tokens['output'], '/tmp/work/task1/output') + self.assertEqual(len(tokens), 1) + + def test_with_upstream_inputs(self): + spec_text = textwrap.dedent('''\ + workflow: + name: serial + tasks: + - name: producer + image: alpine:3.18 + command: ["echo"] + - name: consumer + image: alpine:3.18 + command: ["echo"] + inputs: + - task: producer + ''') + executor = LocalExecutor(work_dir='/tmp/work') + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + + executor._results['producer'] = TaskResult( + name='producer', exit_code=0, output_dir='/tmp/work/producer/output') + + node = executor._task_nodes['consumer'] + tokens = 
executor._build_token_map(node, '/tmp/work/consumer/output') + + self.assertEqual(tokens['output'], '/tmp/work/consumer/output') + self.assertEqual(tokens['input:0'], '/tmp/work/producer/output') + self.assertEqual(tokens['input:producer'], '/tmp/work/producer/output') + + +class TestValidateForLocal(unittest.TestCase): + """Verify that unsupported features are detected and rejected.""" + + def _make_executor(self) -> LocalExecutor: + return LocalExecutor(work_dir='/tmp/unused') + + def test_simple_spec_passes(self): + spec_text = textwrap.dedent('''\ + workflow: + name: ok + tasks: + - name: task + image: alpine:3.18 + command: ["echo", "ok"] + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + executor._validate_for_local(spec) + + def test_dataset_input_rejected(self): + spec_text = textwrap.dedent('''\ + workflow: + name: bad + tasks: + - name: task + image: ubuntu:24.04 + command: ["echo"] + inputs: + - dataset: + name: my_dataset + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + with self.assertRaises(ValueError) as context: + executor._validate_for_local(spec) + self.assertIn('dataset', str(context.exception)) + + def test_url_input_rejected(self): + spec_text = textwrap.dedent('''\ + workflow: + name: bad + tasks: + - name: task + image: ubuntu:24.04 + command: ["echo"] + inputs: + - url: s3://my-bucket/data/ + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + with self.assertRaises(ValueError) as context: + executor._validate_for_local(spec) + self.assertIn('URL', str(context.exception)) + + def test_dataset_output_rejected(self): + spec_text = textwrap.dedent('''\ + workflow: + name: bad + tasks: + - name: task + image: ubuntu:24.04 + command: ["echo"] + outputs: + - dataset: + name: my_dataset + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + 
executor._build_dag(spec) + with self.assertRaises(ValueError) as context: + executor._validate_for_local(spec) + self.assertIn('dataset', str(context.exception).lower()) + + def test_url_output_rejected(self): + spec_text = textwrap.dedent('''\ + workflow: + name: bad + tasks: + - name: task + image: ubuntu:24.04 + command: ["echo"] + outputs: + - url: s3://my-bucket/models/ + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + with self.assertRaises(ValueError) as context: + executor._validate_for_local(spec) + self.assertIn('object storage', str(context.exception).lower()) + + def test_multiple_unsupported_features_all_reported(self): + spec_text = textwrap.dedent('''\ + workflow: + name: bad + tasks: + - name: task1 + image: ubuntu:24.04 + command: ["echo"] + inputs: + - url: s3://bucket/data/ + - name: task2 + image: ubuntu:24.04 + command: ["echo"] + inputs: + - dataset: + name: ds + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + with self.assertRaises(ValueError) as context: + executor._validate_for_local(spec) + error_message = str(context.exception) + self.assertIn('task1', error_message) + self.assertIn('task2', error_message) + + def test_task_deps_only_passes(self): + spec_text = textwrap.dedent('''\ + workflow: + name: ok + tasks: + - name: producer + image: alpine:3.18 + command: ["echo"] + - name: consumer + image: alpine:3.18 + command: ["echo"] + inputs: + - task: producer + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + executor._validate_for_local(spec) + + def test_files_and_env_pass(self): + spec_text = textwrap.dedent('''\ + workflow: + name: ok + tasks: + - name: task + image: alpine:3.18 + command: ["sh", "/tmp/run.sh"] + environment: + MY_VAR: hello + files: + - contents: echo hi + path: /tmp/run.sh + ''') + executor = self._make_executor() + spec = 
executor.load_spec(spec_text) + executor._build_dag(spec) + executor._validate_for_local(spec) + + +class TestJinjaTemplateDetection(unittest.TestCase): + + def _write_temp_spec(self, content: str) -> str: + f = tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) + f.write(content) + f.flush() + f.close() + return f.name + + def test_jinja_block_detected(self): + path = self._write_temp_spec(textwrap.dedent('''\ + workflow: + name: {%% if true %%}test{%% endif %%} + tasks: + - name: task + image: alpine:3.18 + command: ["echo"] + ''')) + try: + with self.assertRaises(ValueError) as context: + run_workflow_locally(path) + self.assertIn('Jinja', str(context.exception)) + finally: + os.unlink(path) + + def test_jinja_comment_detected(self): + path = self._write_temp_spec(textwrap.dedent('''\ + {# A comment #} + workflow: + name: test + tasks: + - name: task + image: alpine:3.18 + command: ["echo"] + ''')) + try: + with self.assertRaises(ValueError) as context: + run_workflow_locally(path) + self.assertIn('Jinja', str(context.exception)) + finally: + os.unlink(path) + + def test_default_values_section_detected(self): + path = self._write_temp_spec(textwrap.dedent('''\ + workflow: + name: "{{experiment_name}}" + tasks: + - name: task + image: alpine:3.18 + command: ["echo"] + default-values: + experiment_name: my-experiment + ''')) + try: + with self.assertRaises(ValueError) as context: + run_workflow_locally(path) + self.assertIn('Jinja', str(context.exception)) + finally: + os.unlink(path) + + +# ============================================================================ +# Integration tests — require Docker; test actual container execution +# ============================================================================ +@unittest.skipUnless(DOCKER_AVAILABLE, SKIP_DOCKER_MSG) +class TestDockerExecution(unittest.TestCase): + """ + Integration tests that run real OSMO workflow specs through the local executor + using Docker. 
Each test uses a spec that would normally run on a Kubernetes cluster. + """ + + def setUp(self): + self.work_dir = tempfile.mkdtemp(prefix='osmo-local-test-') + + def tearDown(self): + shutil.rmtree(self.work_dir, ignore_errors=True) + + def _execute_spec(self, spec_text: str) -> bool: + executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + return executor.execute(spec) + + # ---- Single task tests ---- + + def test_hello_world(self): + spec_text = textwrap.dedent('''\ + workflow: + name: hello-osmo + tasks: + - name: hello + image: alpine:3.18 + command: ["echo", "Hello from OSMO!"] + ''') + self.assertTrue(self._execute_spec(spec_text)) + + def test_single_task_with_args(self): + spec_text = textwrap.dedent('''\ + workflow: + name: args-test + tasks: + - name: task + image: alpine:3.18 + command: ["echo"] + args: ["argument1", "argument2"] + ''') + self.assertTrue(self._execute_spec(spec_text)) + + def test_task_failure_returns_false(self): + spec_text = textwrap.dedent('''\ + workflow: + name: will-fail + tasks: + - name: failing-task + image: alpine:3.18 + command: ["sh", "-c", "exit 42"] + ''') + self.assertFalse(self._execute_spec(spec_text)) + + # ---- Environment variable tests ---- + + def test_environment_variables(self): + spec_text = textwrap.dedent('''\ + workflow: + name: env-test + tasks: + - name: check-env + image: alpine:3.18 + command: ["sh", "-c"] + args: ["test \\"$MY_VAR\\" = \\"hello_world\\" && test \\"$SECOND\\" = \\"42\\""] + environment: + MY_VAR: hello_world + SECOND: "42" + ''') + self.assertTrue(self._execute_spec(spec_text)) + + # ---- Files mount tests ---- + + def test_inline_file_mounted(self): + spec_text = textwrap.dedent('''\ + workflow: + name: files-test + tasks: + - name: check-file + image: alpine:3.18 + command: ["sh", "/tmp/run.sh"] + files: + - contents: | + echo "script ran successfully" + path: /tmp/run.sh + ''') + self.assertTrue(self._execute_spec(spec_text)) + 
+ def test_multiple_files_mounted(self): + spec_text = textwrap.dedent('''\ + workflow: + name: multi-files + tasks: + - name: check-files + image: alpine:3.18 + command: ["sh", "-c"] + args: ["cat /tmp/config.txt && sh /scripts/run.sh"] + files: + - contents: "key=value" + path: /tmp/config.txt + - contents: | + echo "second script ok" + path: /scripts/run.sh + ''') + self.assertTrue(self._execute_spec(spec_text)) + + # ---- Data output tests ---- + + def test_output_directory_writable(self): + spec_text = textwrap.dedent('''\ + workflow: + name: output-test + tasks: + - name: write-output + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo 'payload' > {{output}}/result.txt"] + ''') + self.assertTrue(self._execute_spec(spec_text)) + output_file = os.path.join(self.work_dir, 'write-output', 'output', 'result.txt') + self.assertTrue(os.path.exists(output_file)) + with open(output_file) as f: + self.assertEqual(f.read().strip(), 'payload') + + # ---- Serial data flow tests ---- + + def test_serial_data_flow_two_tasks(self): + spec_text = textwrap.dedent('''\ + workflow: + name: serial-data + tasks: + - name: producer + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo 'from_producer' > {{output}}/data.txt"] + - name: consumer + image: alpine:3.18 + command: ["sh", "-c"] + args: ["cat {{input:0}}/data.txt > {{output}}/received.txt"] + inputs: + - task: producer + ''') + self.assertTrue(self._execute_spec(spec_text)) + received = os.path.join(self.work_dir, 'consumer', 'output', 'received.txt') + self.assertTrue(os.path.exists(received)) + with open(received) as f: + self.assertEqual(f.read().strip(), 'from_producer') + + def test_serial_chain_three_tasks(self): + """Mimics cookbook/tutorials/serial_workflow.yaml""" + spec_text = textwrap.dedent('''\ + workflow: + name: serial-chain + tasks: + - name: task1 + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo 'task1_data' > {{output}}/result.txt"] + + - name: task2 + image: alpine:3.18 + command: 
["sh", "-c"] + args: + - | + cat {{input:0}}/result.txt > {{output}}/result.txt + echo '_plus_task2' >> {{output}}/result.txt + inputs: + - task: task1 + + - name: task3 + image: alpine:3.18 + command: ["sh", "-c"] + args: + - | + cat {{input:0}}/result.txt > {{output}}/final.txt + cat {{input:1}}/result.txt >> {{output}}/final.txt + inputs: + - task: task1 + - task: task2 + ''') + self.assertTrue(self._execute_spec(spec_text)) + final = os.path.join(self.work_dir, 'task3', 'output', 'final.txt') + with open(final) as f: + content = f.read() + self.assertIn('task1_data', content) + self.assertIn('_plus_task2', content) + + # ---- Parallel execution tests ---- + + def test_parallel_independent_tasks(self): + """Mimics cookbook/tutorials/parallel_tasks.yaml""" + spec_text = textwrap.dedent('''\ + workflow: + name: parallel-tasks + tasks: + - name: task-a + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo 'a' > {{output}}/marker.txt"] + - name: task-b + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo 'b' > {{output}}/marker.txt"] + - name: task-c + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo 'c' > {{output}}/marker.txt"] + ''') + self.assertTrue(self._execute_spec(spec_text)) + for task_name, expected in [('task-a', 'a'), ('task-b', 'b'), ('task-c', 'c')]: + marker = os.path.join(self.work_dir, task_name, 'output', 'marker.txt') + with open(marker) as f: + self.assertEqual(f.read().strip(), expected) + + # ---- Diamond DAG tests ---- + + def test_diamond_dag(self): + spec_text = textwrap.dedent('''\ + workflow: + name: diamond + tasks: + - name: root + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo 'root_data' > {{output}}/base.txt"] + - name: left + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo 'left:' > {{output}}/result.txt && cat {{input:0}}/base.txt >> {{output}}/result.txt"] + inputs: + - task: root + - name: right + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo 'right:' > {{output}}/result.txt 
&& cat {{input:0}}/base.txt >> {{output}}/result.txt"] + inputs: + - task: root + - name: join + image: alpine:3.18 + command: ["sh", "-c"] + args: ["cat {{input:0}}/result.txt > {{output}}/final.txt && cat {{input:1}}/result.txt >> {{output}}/final.txt"] + inputs: + - task: left + - task: right + ''') + self.assertTrue(self._execute_spec(spec_text)) + final = os.path.join(self.work_dir, 'join', 'output', 'final.txt') + with open(final) as f: + content = f.read() + self.assertIn('left:', content) + self.assertIn('right:', content) + self.assertIn('root_data', content) + + # ---- Failure propagation tests ---- + + def test_failure_cancels_downstream(self): + spec_text = textwrap.dedent('''\ + workflow: + name: fail-chain + tasks: + - name: failing + image: alpine:3.18 + command: ["sh", "-c", "exit 1"] + - name: should-not-run + image: alpine:3.18 + command: ["sh", "-c", "echo 'oops' > {{output}}/should_not_exist.txt"] + inputs: + - task: failing + ''') + self.assertFalse(self._execute_spec(spec_text)) + output_file = os.path.join(self.work_dir, 'should-not-run', 'output', 'should_not_exist.txt') + self.assertFalse(os.path.exists(output_file)) + + def test_parallel_failure_does_not_affect_independent_branch(self): + spec_text = textwrap.dedent('''\ + workflow: + name: partial-fail + tasks: + - name: root + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo ok > {{output}}/data.txt"] + - name: fail-branch + image: alpine:3.18 + command: ["sh", "-c", "exit 1"] + inputs: + - task: root + - name: ok-branch + image: alpine:3.18 + command: ["sh", "-c"] + args: ["cat {{input:0}}/data.txt > {{output}}/received.txt"] + inputs: + - task: root + ''') + result = self._execute_spec(spec_text) + # The executor should stop on first failure, so the overall result is False. + # root succeeds, then one of the branches fails. 
+ self.assertFalse(result) + + # ---- Groups (ganged tasks) tests ---- + + def test_group_with_single_task(self): + spec_text = textwrap.dedent('''\ + workflow: + name: single-group + groups: + - name: my-group + tasks: + - name: leader + lead: true + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo 'group_ok' > {{output}}/marker.txt"] + ''') + self.assertTrue(self._execute_spec(spec_text)) + marker = os.path.join(self.work_dir, 'leader', 'output', 'marker.txt') + with open(marker) as f: + self.assertEqual(f.read().strip(), 'group_ok') + + def test_groups_with_data_flow(self): + """Mimics cookbook/tutorials/combination_workflow_simple.yaml structure.""" + spec_text = textwrap.dedent('''\ + workflow: + name: data-pipeline + groups: + - name: prepare-data + tasks: + - name: generate-dataset + lead: true + image: alpine:3.18 + command: ["sh", "-c"] + args: + - | + mkdir -p {{output}}/data + for i in 1 2 3; do echo "sample_$i" >> {{output}}/data/dataset.csv; done + - name: train-models + tasks: + - name: train-model + lead: true + image: alpine:3.18 + command: ["sh", "-c"] + args: + - | + wc -l {{input:0}}/data/dataset.csv > {{output}}/line_count.txt + inputs: + - task: generate-dataset + ''') + self.assertTrue(self._execute_spec(spec_text)) + line_count_file = os.path.join(self.work_dir, 'train-model', 'output', 'line_count.txt') + with open(line_count_file) as f: + content = f.read() + self.assertIn('3', content) + + # ---- Input by task name tests ---- + + def test_input_by_task_name(self): + spec_text = textwrap.dedent('''\ + workflow: + name: named-input + tasks: + - name: producer + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo 'named_data' > {{output}}/out.txt"] + - name: consumer + image: alpine:3.18 + command: ["sh", "-c"] + args: ["cat {{input:producer}}/out.txt > {{output}}/received.txt"] + inputs: + - task: producer + ''') + self.assertTrue(self._execute_spec(spec_text)) + received = os.path.join(self.work_dir, 'consumer', 'output', 
'received.txt') + with open(received) as f: + self.assertEqual(f.read().strip(), 'named_data') + + # ---- Files with token substitution ---- + + def test_file_contents_with_token_substitution(self): + """Mimics cookbook/tutorials/serial_workflow.yaml pattern of inline scripts with tokens.""" + spec_text = textwrap.dedent('''\ + workflow: + name: file-tokens + tasks: + - name: writer + image: alpine:3.18 + command: ["sh", "/tmp/run.sh"] + files: + - contents: | + echo "writing output" + echo "file_data" > {{output}}/result.txt + path: /tmp/run.sh + - name: reader + image: alpine:3.18 + command: ["sh", "/tmp/run.sh"] + files: + - contents: | + cat {{input:0}}/result.txt > {{output}}/received.txt + path: /tmp/run.sh + inputs: + - task: writer + ''') + self.assertTrue(self._execute_spec(spec_text)) + received = os.path.join(self.work_dir, 'reader', 'output', 'received.txt') + with open(received) as f: + self.assertEqual(f.read().strip(), 'file_data') + + # ---- Resource spec ignored gracefully ---- + + def test_resources_ignored_gracefully(self): + """Resource specs are K8s-specific; local executor should accept and ignore them.""" + spec_text = textwrap.dedent('''\ + workflow: + name: with-resources + resources: + default: + cpu: 2 + memory: 4Gi + storage: 10Gi + tasks: + - name: task + image: alpine:3.18 + command: ["echo", "ok"] + ''') + self.assertTrue(self._execute_spec(spec_text)) + + # ---- Docker-not-found handling ---- + + def test_docker_not_found_graceful_failure(self): + spec_text = textwrap.dedent('''\ + workflow: + name: no-docker + tasks: + - name: task + image: alpine:3.18 + command: ["echo", "ok"] + ''') + executor = LocalExecutor( + work_dir=self.work_dir, + keep_work_dir=True, + docker_cmd='nonexistent-docker-binary-12345', + ) + spec = executor.load_spec(spec_text) + self.assertFalse(executor.execute(spec)) + + # ---- Alternative container runtime ---- + + def test_custom_docker_command(self): + spec_text = textwrap.dedent('''\ + workflow: + name: 
custom-cmd + tasks: + - name: task + image: alpine:3.18 + command: ["echo", "ok"] + ''') + executor = LocalExecutor( + work_dir=self.work_dir, + keep_work_dir=True, + docker_cmd='docker', + ) + spec = executor.load_spec(spec_text) + self.assertTrue(executor.execute(spec)) + + +# ============================================================================ +# Integration tests using actual cookbook spec files from the repo +# ============================================================================ +@unittest.skipUnless(DOCKER_AVAILABLE, SKIP_DOCKER_MSG) +class TestCookbookSpecs(unittest.TestCase): + """ + Run real OSMO cookbook YAML specs that are designed for Kubernetes clusters, + and verify they execute successfully in the local Docker executor. + """ + + COOKBOOK_DIR = os.path.join(os.path.dirname(__file__), '..', '..', '..', + 'cookbook', 'tutorials') + + def setUp(self): + self.work_dir = tempfile.mkdtemp(prefix='osmo-local-cookbook-') + + def tearDown(self): + shutil.rmtree(self.work_dir, ignore_errors=True) + + def _run_cookbook_spec(self, filename: str) -> bool: + spec_path = os.path.join(self.COOKBOOK_DIR, filename) + if not os.path.exists(spec_path): + self.skipTest(f'Cookbook file not found: {spec_path}') + return run_workflow_locally( + spec_path=spec_path, + work_dir=self.work_dir, + keep_work_dir=True, + ) + + def test_hello_world_yaml(self): + self.assertTrue(self._run_cookbook_spec('hello_world.yaml')) + + def test_parallel_tasks_yaml(self): + self.assertTrue(self._run_cookbook_spec('parallel_tasks.yaml')) + + def test_serial_workflow_yaml(self): + self.assertTrue(self._run_cookbook_spec('serial_workflow.yaml')) + + def test_resources_basic_yaml(self): + self.assertTrue(self._run_cookbook_spec('resources_basic.yaml')) + + def test_combination_workflow_simple_yaml(self): + """ + The combination_workflow_simple.yaml has a 'sleep 120' in transform-a. 
+ We skip it here because it would take 2+ minutes; a trimmed version + of the same structure is tested in TestDockerExecution.test_groups_with_data_flow. + """ + self.skipTest('Contains sleep 120; covered by test_groups_with_data_flow') + + def test_unsupported_spec_data_download(self): + """data_download.yaml uses URL inputs — verify it is cleanly rejected.""" + with self.assertRaises(ValueError) as context: + self._run_cookbook_spec('data_download.yaml') + self.assertIn('URL', str(context.exception)) + + def test_unsupported_spec_data_upload(self): + """data_upload.yaml uses URL outputs — verify it is cleanly rejected.""" + with self.assertRaises(ValueError) as context: + self._run_cookbook_spec('data_upload.yaml') + self.assertIn('object storage', str(context.exception).lower()) + + def test_unsupported_spec_dataset_upload(self): + """dataset_upload.yaml uses dataset outputs — verify it is cleanly rejected.""" + with self.assertRaises(ValueError) as context: + self._run_cookbook_spec('dataset_upload.yaml') + self.assertIn('dataset', str(context.exception).lower()) + + def test_unsupported_spec_template(self): + """template_hello_world.yaml uses default-values templating — verify it is rejected.""" + spec_path = os.path.join(self.COOKBOOK_DIR, 'template_hello_world.yaml') + if not os.path.exists(spec_path): + self.skipTest('Cookbook file not found') + with self.assertRaises(ValueError) as context: + run_workflow_locally( + spec_path=spec_path, + work_dir=self.work_dir, + keep_work_dir=True, + ) + self.assertIn('Jinja', str(context.exception)) + + +# ============================================================================ +# run_workflow_locally() integration tests +# ============================================================================ +@unittest.skipUnless(DOCKER_AVAILABLE, SKIP_DOCKER_MSG) +class TestRunWorkflowLocally(unittest.TestCase): + """Test the top-level run_workflow_locally() convenience function.""" + + def setUp(self): + self.work_dir = 
tempfile.mkdtemp(prefix='osmo-local-func-') + + def tearDown(self): + shutil.rmtree(self.work_dir, ignore_errors=True) + + def test_success_cleans_up_when_not_keeping(self): + work_dir = tempfile.mkdtemp(prefix='osmo-local-cleanup-') + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: + f.write(textwrap.dedent('''\ + workflow: + name: cleanup-test + tasks: + - name: task + image: alpine:3.18 + command: ["echo", "ok"] + ''')) + spec_path = f.name + try: + result = run_workflow_locally( + spec_path=spec_path, + work_dir=work_dir, + keep_work_dir=False, + ) + self.assertTrue(result) + self.assertFalse(os.path.exists(work_dir)) + finally: + os.unlink(spec_path) + if os.path.exists(work_dir): + shutil.rmtree(work_dir, ignore_errors=True) + + def test_failure_preserves_work_dir(self): + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: + f.write(textwrap.dedent('''\ + workflow: + name: fail-test + tasks: + - name: task + image: alpine:3.18 + command: ["sh", "-c", "exit 1"] + ''')) + spec_path = f.name + try: + result = run_workflow_locally( + spec_path=spec_path, + work_dir=self.work_dir, + keep_work_dir=False, + ) + self.assertFalse(result) + self.assertTrue(os.path.exists(self.work_dir)) + finally: + os.unlink(spec_path) + + def test_keep_flag_preserves_on_success(self): + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: + f.write(textwrap.dedent('''\ + workflow: + name: keep-test + tasks: + - name: task + image: alpine:3.18 + command: ["echo", "ok"] + ''')) + spec_path = f.name + try: + result = run_workflow_locally( + spec_path=spec_path, + work_dir=self.work_dir, + keep_work_dir=True, + ) + self.assertTrue(result) + self.assertTrue(os.path.exists(self.work_dir)) + finally: + os.unlink(spec_path) + + def test_nonexistent_file_raises(self): + with self.assertRaises(FileNotFoundError): + run_workflow_locally(spec_path='/nonexistent/path/spec.yaml') + + +if __name__ == '__main__': + 
unittest.main() From 668c0a941246fe46b6620daa0b292329a9f52da3 Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Thu, 2 Apr 2026 17:08:17 -0700 Subject: [PATCH 02/35] Add local.py and update dependencies in BUILD file --- src/cli/BUILD | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cli/BUILD b/src/cli/BUILD index 7a9b905ee..cdada591a 100755 --- a/src/cli/BUILD +++ b/src/cli/BUILD @@ -37,6 +37,7 @@ osmo_py_library( "dataset.py", "editor.py", "formatters.py", + "local.py", "login.py", "main_parser.py", "pool.py", @@ -73,6 +74,7 @@ osmo_py_library( "//src/lib/utils:validation", "//src/lib/utils:version", "//src/lib/utils:workflow", + "//src/utils:local_executor", ], ) From 63b062016d189286672f3695edb6bc0e72b4dfd3 Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Fri, 3 Apr 2026 14:37:32 -0700 Subject: [PATCH 03/35] Add GPU passthrough support in LocalExecutor - Implemented GPU resource handling in LocalExecutor to allow tasks to request GPU resources. - Added a new method to determine the GPU count for tasks based on their resource specifications. - Updated Docker run command to include GPU options when applicable. 
--- src/utils/local_executor.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/utils/local_executor.py b/src/utils/local_executor.py index c1926db34..c7f0939ca 100644 --- a/src/utils/local_executor.py +++ b/src/utils/local_executor.py @@ -60,6 +60,7 @@ class LocalExecutor: - Inline `files:` written to the container - `environment:` passed as Docker env vars - Task-to-task data flow via shared local directories + - GPU passthrough via --gpus for tasks that declare gpu > 0 in resources Does NOT support (raises clear errors): - Dataset / URL inputs/outputs (require object storage) @@ -204,6 +205,13 @@ def _cancel_downstream(self, failed_task: str): name=downstream, exit_code=-1, output_dir='') queue.append(downstream) + def _task_gpu_count(self, task_spec: task_module.TaskSpec, + spec: workflow_module.WorkflowSpec) -> int: + resource_spec = spec.resources.get(task_spec.resource) + if resource_spec and resource_spec.gpu: + return resource_spec.gpu + return 0 + def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskResult: task_spec = node.spec task_dir = os.path.join(self._work_dir, node.name) @@ -225,6 +233,11 @@ def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskR docker_args = [self._docker_cmd, 'run', '--rm'] + gpu_count = self._task_gpu_count(task_spec, spec) + if gpu_count > 0: + docker_args += ['--gpus', f'"device={",".join(str(i) for i in range(gpu_count))}"'] + logger.info('Task "%s" requesting %d GPU(s)', node.name, gpu_count) + for key, value in task_spec.environment.items(): resolved_value = self._substitute_tokens(value, token_map) docker_args += ['-e', f'{key}={resolved_value}'] From b20f2b9650d8ca5659e5348a3172e256d90555aa Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Fri, 3 Apr 2026 14:59:51 -0700 Subject: [PATCH 04/35] Update Docker command construction in LocalExecutor - Adjusted the handling of the resolved_command to correctly set the entrypoint and append arguments. 
- Ensured that the first element of resolved_command is used as the entrypoint while the rest are appended to the Docker command. --- src/utils/local_executor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/utils/local_executor.py b/src/utils/local_executor.py index c7f0939ca..321a2bc9e 100644 --- a/src/utils/local_executor.py +++ b/src/utils/local_executor.py @@ -254,8 +254,10 @@ def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskR host_path = os.path.join(files_dir, file_spec.path.lstrip('/')) docker_args += ['-v', f'{host_path}:{file_spec.path}:ro'] + if resolved_command: + docker_args += ['--entrypoint', resolved_command[0]] docker_args.append(task_spec.image) - docker_args += resolved_command + resolved_args + docker_args += resolved_command[1:] + resolved_args logger.debug('Docker command: %s', ' '.join(docker_args)) From bcf4246ba80ba2de427536723fcfeccf6421cb38 Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Fri, 3 Apr 2026 16:25:26 -0700 Subject: [PATCH 05/35] Add resume functionality to local workflow execution - Introduced `--resume` and `--from-step` options in the CLI to allow resuming previous runs. - Implemented state management in `LocalExecutor` to save and restore task results. - Enhanced logging to provide feedback on skipped tasks and remaining tasks during resumption. - Added GPU detection improvements to handle scenarios where requested GPUs are unavailable. --- src/cli/local.py | 15 +++++ src/utils/local_executor.py | 128 ++++++++++++++++++++++++++++++++++-- 2 files changed, 136 insertions(+), 7 deletions(-) diff --git a/src/cli/local.py b/src/cli/local.py index a9f12eaaf..b1d3be72e 100644 --- a/src/cli/local.py +++ b/src/cli/local.py @@ -54,6 +54,19 @@ def setup_parser(parser: argparse._SubParsersAction): dest='docker_cmd', default='docker', help='Docker-compatible command to use (e.g. podman). 
Default: docker.') + run_parser.add_argument( + '--resume', + action='store_true', + default=False, + help='Resume a previous run, skipping tasks that already completed successfully. ' + 'Requires --work-dir pointing to the previous run directory.') + run_parser.add_argument( + '--from-step', + dest='from_step', + default=None, + help='Resume from a specific task, re-running it and all downstream tasks. ' + 'Tasks upstream of the specified step are skipped if they completed ' + 'successfully. Requires --work-dir pointing to the previous run directory.') run_parser.set_defaults(func=_run_local) @@ -63,6 +76,8 @@ def _run_local(service_client, args: argparse.Namespace): spec_path=args.workflow_file, work_dir=args.work_dir, keep_work_dir=args.keep, + resume=args.resume, + from_step=args.from_step, ) except ValueError as error: print(f'Error: {error}', file=sys.stderr) diff --git a/src/utils/local_executor.py b/src/utils/local_executor.py index 321a2bc9e..52da92d98 100644 --- a/src/utils/local_executor.py +++ b/src/utils/local_executor.py @@ -17,6 +17,7 @@ """ import dataclasses +import json import logging import os import re @@ -33,6 +34,8 @@ logger = logging.getLogger(__name__) +STATE_FILE_NAME = '.osmo-state.json' + @dataclasses.dataclass class TaskNode: @@ -74,19 +77,53 @@ def __init__(self, work_dir: str, keep_work_dir: bool = False, docker_cmd: str = self._docker_cmd = docker_cmd self._task_nodes: Dict[str, TaskNode] = {} self._results: Dict[str, TaskResult] = {} + self._available_gpus: int | None = None + + def _detect_available_gpus(self) -> int: + if self._available_gpus is not None: + return self._available_gpus + try: + result = subprocess.run( + ['nvidia-smi', '--query-gpu=index', '--format=csv,noheader'], + capture_output=True, text=True, timeout=10, + ) + if result.returncode == 0: + gpu_indices = [line.strip() for line in result.stdout.strip().splitlines() if line.strip()] + self._available_gpus = len(gpu_indices) + else: + logger.warning('nvidia-smi 
failed (exit %d) — assuming 0 GPUs available', result.returncode) + self._available_gpus = 0 + except FileNotFoundError: + logger.warning('nvidia-smi not found — assuming 0 GPUs available') + self._available_gpus = 0 + except subprocess.TimeoutExpired: + logger.warning('nvidia-smi timed out — assuming 0 GPUs available') + self._available_gpus = 0 + return self._available_gpus def load_spec(self, spec_text: str) -> workflow_module.WorkflowSpec: raw = yaml.safe_load(spec_text) versioned = workflow_module.VersionedWorkflowSpec(**raw) return versioned.workflow - def execute(self, spec: workflow_module.WorkflowSpec) -> bool: + def execute(self, spec: workflow_module.WorkflowSpec, + resume: bool = False, from_step: str | None = None) -> bool: self._build_dag(spec) self._validate_for_local(spec) self._setup_directories() - logger.info('Workflow "%s": %d task(s) across %d group(s)', - spec.name, sum(len(g.tasks) for g in self._groups(spec)), len(self._groups(spec))) + if resume or from_step: + self._restore_completed_tasks(from_step) + + total_tasks = sum(len(g.tasks) for g in self._groups(spec)) + skipped = len(self._results) + remaining = total_tasks - skipped + if skipped > 0: + logger.info('Workflow "%s": resuming — %d task(s) skipped, %d remaining', + spec.name, skipped, remaining) + else: + logger.info('Workflow "%s": %d task(s) across %d group(s)', + spec.name, total_tasks, len(self._groups(spec))) ready = self._find_ready_tasks() while ready: @@ -95,6 +132,7 @@ def execute(self, spec: workflow_module.WorkflowSpec) -> bool: logger.info('--- Running task: %s (image: %s) ---', task_name, node.spec.image) result = self._run_task(node, spec) self._results[task_name] = result + self._save_state() if result.exit_code != 0: logger.error('Task "%s" failed with exit code %d', task_name, result.exit_code) @@ -113,6 +151,64 @@ def execute(self, spec: workflow_module.WorkflowSpec) -> bool: logger.info('Workflow "%s" completed successfully', spec.name) return True + @property + 
def _state_file_path(self) -> str: + return os.path.join(self._work_dir, STATE_FILE_NAME) + + def _save_state(self): + state = { + 'tasks': { + name: {'exit_code': result.exit_code, 'output_dir': result.output_dir} + for name, result in self._results.items() + if result.exit_code != -1 + } + } + with open(self._state_file_path, 'w') as f: + json.dump(state, f, indent=2) + + def _load_state(self) -> Dict | None: + if not os.path.exists(self._state_file_path): + return None + with open(self._state_file_path) as f: + return json.load(f) + + def _restore_completed_tasks(self, from_step: str | None = None): + state = self._load_state() + if state is None: + logger.info('No previous state found — starting from scratch') + return + + completed: Dict[str, Dict] = {} + for name, info in state.get('tasks', {}).items(): + if name not in self._task_nodes: + continue + if info['exit_code'] == 0 and os.path.isdir(info['output_dir']): + completed[name] = info + + if from_step: + if from_step not in self._task_nodes: + raise ValueError(f'Task "{from_step}" not found in workflow') + to_invalidate = self._get_downstream_tasks(from_step) + to_invalidate.add(from_step) + for name in to_invalidate: + completed.pop(name, None) + + for name, info in completed.items(): + self._results[name] = TaskResult( + name=name, exit_code=0, output_dir=info['output_dir']) + logger.info('Resuming: skipping completed task "%s"', name) + + def _get_downstream_tasks(self, task_name: str) -> Set[str]: + visited: Set[str] = set() + queue = [task_name] + while queue: + current = queue.pop(0) + for downstream in self._task_nodes[current].downstream: + if downstream not in visited: + visited.add(downstream) + queue.append(downstream) + return visited + def _groups(self, spec: workflow_module.WorkflowSpec) -> List[task_module.TaskGroupSpec]: if spec.groups: return spec.groups @@ -235,8 +331,19 @@ def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskR gpu_count = 
self._task_gpu_count(task_spec, spec) if gpu_count > 0: - docker_args += ['--gpus', f'"device={",".join(str(i) for i in range(gpu_count))}"'] - logger.info('Task "%s" requesting %d GPU(s)', node.name, gpu_count) + available = self._detect_available_gpus() + if available == 0: + logger.warning( + 'Task "%s" requests %d GPU(s) but no GPUs are available — running without GPU support', + node.name, gpu_count) + elif gpu_count > available: + logger.warning( + 'Task "%s" requests %d GPU(s) but only %d available — running with %d GPU(s)', + node.name, gpu_count, available, available) + docker_args += ['--gpus', f'"device={",".join(str(i) for i in range(available))}"'] + else: + docker_args += ['--gpus', f'"device={",".join(str(i) for i in range(gpu_count))}"'] + logger.info('Task "%s" requesting %d GPU(s), using %d', node.name, gpu_count, min(gpu_count, available)) for key, value in task_spec.environment.items(): resolved_value = self._substitute_tokens(value, token_map) @@ -286,7 +393,13 @@ def _substitute_tokens(self, text: str, tokens: Dict[str, str]) -> str: def run_workflow_locally(spec_path: str, work_dir: str | None = None, - keep_work_dir: bool = False) -> bool: + keep_work_dir: bool = False, + resume: bool = False, + from_step: str | None = None) -> bool: + if (resume or from_step) and work_dir is None: + raise ValueError( + '--resume and --from-step require --work-dir pointing to a previous run directory.') + if work_dir is None: work_dir = tempfile.mkdtemp(prefix='osmo-local-') logger.info('Using temporary work directory: %s', work_dir) @@ -303,7 +416,8 @@ def run_workflow_locally(spec_path: str, work_dir: str | None = None, executor = LocalExecutor(work_dir=work_dir, keep_work_dir=keep_work_dir) spec = executor.load_spec(spec_text) - success = executor.execute(spec) + success = executor.execute(spec, resume=resume or from_step is not None, + from_step=from_step) if not keep_work_dir and success: logger.info('Cleaning up work directory: %s', work_dir) From 
3e4f7c3a5afd53219ac8df227067b172b0aaa4ff Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Fri, 3 Apr 2026 16:25:44 -0700 Subject: [PATCH 06/35] Update .gitignore to include .venv directory --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 2fe57a694..b84388d41 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,5 @@ docs/**/domain_config.js .ruff_cache .lycheecache + +.venv/ \ No newline at end of file From 1aa8c307e88f3c06b42a0d729b737428fff0ea2e Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Fri, 3 Apr 2026 16:42:15 -0700 Subject: [PATCH 07/35] Enhance local workflow execution with Docker command support - Added `docker_cmd` parameter to `run_workflow_locally` for customizable Docker command execution. - Improved logging to redact sensitive information in Docker command arguments. - Implemented error handling for unexecuted tasks in `LocalExecutor` to detect potential workflow stalls. --- src/cli/local.py | 1 + src/utils/local_executor.py | 26 +++++++++++++++++++++++--- src/utils/tests/BUILD | 1 - 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/src/cli/local.py b/src/cli/local.py index b1d3be72e..a481c4c56 100644 --- a/src/cli/local.py +++ b/src/cli/local.py @@ -78,6 +78,7 @@ def _run_local(service_client, args: argparse.Namespace): keep_work_dir=args.keep, resume=args.resume, from_step=args.from_step, + docker_cmd=args.docker_cmd, ) except ValueError as error: print(f'Error: {error}', file=sys.stderr) diff --git a/src/utils/local_executor.py b/src/utils/local_executor.py index 52da92d98..8c57861e1 100644 --- a/src/utils/local_executor.py +++ b/src/utils/local_executor.py @@ -143,6 +143,12 @@ def execute(self, spec: workflow_module.WorkflowSpec, ready = self._find_ready_tasks() + unexecuted = set(self._task_nodes.keys()) - set(self._results.keys()) + if unexecuted: + logger.error('Workflow "%s" stalled — tasks could not be scheduled (possible cycle): %s', + spec.name, ', 
'.join(sorted(unexecuted))) + return False + failed = [name for name, r in self._results.items() if r.exit_code != 0] if failed: logger.error('Workflow failed. Failed tasks: %s', ', '.join(failed)) @@ -366,7 +372,19 @@ def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskR docker_args.append(task_spec.image) docker_args += resolved_command[1:] + resolved_args - logger.debug('Docker command: %s', ' '.join(docker_args)) + if logger.isEnabledFor(logging.DEBUG): + redacted_args = [] + skip_next = False + for arg in docker_args: + if skip_next: + redacted_args.append(arg.split('=', 1)[0] + '=REDACTED') + skip_next = False + elif arg == '-e': + redacted_args.append(arg) + skip_next = True + else: + redacted_args.append(arg) + logger.debug('Docker command: %s', ' '.join(redacted_args)) try: process = subprocess.run(docker_args, capture_output=False) @@ -395,7 +413,8 @@ def _substitute_tokens(self, text: str, tokens: Dict[str, str]) -> str: def run_workflow_locally(spec_path: str, work_dir: str | None = None, keep_work_dir: bool = False, resume: bool = False, - from_step: str | None = None) -> bool: + from_step: str | None = None, + docker_cmd: str = 'docker') -> bool: if (resume or from_step) and work_dir is None: raise ValueError( '--resume and --from-step require --work-dir pointing to a previous run directory.') @@ -414,7 +433,8 @@ def run_workflow_locally(spec_path: str, work_dir: str | None = None, 'Run "osmo workflow submit --dry-run -f " first to get the expanded spec,\n' 'then save that output and run it locally.') - executor = LocalExecutor(work_dir=work_dir, keep_work_dir=keep_work_dir) + executor = LocalExecutor(work_dir=work_dir, keep_work_dir=keep_work_dir, + docker_cmd=docker_cmd) spec = executor.load_spec(spec_text) success = executor.execute(spec, resume=resume or from_step is not None, from_step=from_step) diff --git a/src/utils/tests/BUILD b/src/utils/tests/BUILD index 59050591c..a9369af05 100644 --- a/src/utils/tests/BUILD +++ 
b/src/utils/tests/BUILD @@ -62,5 +62,4 @@ py_test( "//src/utils:local_executor", ], local = True, - tags = ["manual"], ) From d332f6caedd4d4e79d04186cdb5abddc41f29aa5 Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Fri, 3 Apr 2026 17:14:24 -0700 Subject: [PATCH 08/35] Enhance documentation and comments in local execution modules - Added detailed docstrings to functions and classes in `local.py` and `local_executor.py` to improve code readability and maintainability. - Updated test cases in `test_local_executor.py` with descriptive comments to clarify the purpose of each test. - Ensured consistency in documentation style across the codebase. --- src/cli/local.py | 2 + src/utils/local_executor.py | 24 ++++++++ src/utils/tests/test_local_executor.py | 78 +++++++++++++++++++++++++- 3 files changed, 103 insertions(+), 1 deletion(-) diff --git a/src/cli/local.py b/src/cli/local.py index a481c4c56..5bf596879 100644 --- a/src/cli/local.py +++ b/src/cli/local.py @@ -25,6 +25,7 @@ def setup_parser(parser: argparse._SubParsersAction): + """Register the 'local' subcommand and its nested 'run' action with the CLI argument parser.""" local_parser = parser.add_parser( 'local', help='Run workflows locally using Docker (no Kubernetes cluster required).') @@ -71,6 +72,7 @@ def setup_parser(parser: argparse._SubParsersAction): def _run_local(service_client, args: argparse.Namespace): + """Execute a workflow locally via Docker using the parsed CLI arguments.""" try: success = local_executor.run_workflow_locally( spec_path=args.workflow_file, diff --git a/src/utils/local_executor.py b/src/utils/local_executor.py index 8c57861e1..4a84c3ec8 100644 --- a/src/utils/local_executor.py +++ b/src/utils/local_executor.py @@ -39,6 +39,8 @@ @dataclasses.dataclass class TaskNode: + """A node in the workflow DAG, linking a task spec to its upstream and downstream dependencies.""" + name: str spec: task_module.TaskSpec group: str @@ -48,6 +50,8 @@ class TaskNode: @dataclasses.dataclass class 
TaskResult: + """Outcome of a single task execution, capturing its exit code and output directory path.""" + name: str exit_code: int output_dir: str @@ -72,6 +76,7 @@ class LocalExecutor: """ def __init__(self, work_dir: str, keep_work_dir: bool = False, docker_cmd: str = 'docker'): + """Initialize the executor with a work directory, cleanup preference, and container runtime command.""" self._work_dir = work_dir self._keep_work_dir = keep_work_dir self._docker_cmd = docker_cmd @@ -80,6 +85,7 @@ def __init__(self, work_dir: str, keep_work_dir: bool = False, docker_cmd: str = self._available_gpus: int | None = None def _detect_available_gpus(self) -> int: + """Query nvidia-smi to count available GPUs, caching the result for subsequent calls.""" if self._available_gpus is not None: return self._available_gpus try: @@ -102,12 +108,14 @@ def _detect_available_gpus(self) -> int: return self._available_gpus def load_spec(self, spec_text: str) -> workflow_module.WorkflowSpec: + """Parse raw YAML text into a validated WorkflowSpec via the versioned spec model.""" raw = yaml.safe_load(spec_text) versioned = workflow_module.VersionedWorkflowSpec(**raw) return versioned.workflow def execute(self, spec: workflow_module.WorkflowSpec, resume: bool = False, from_step: str | None = None) -> bool: + """Run all tasks in topological order, returning True if the entire workflow succeeds.""" self._build_dag(spec) self._validate_for_local(spec) self._setup_directories() @@ -159,9 +167,11 @@ def execute(self, spec: workflow_module.WorkflowSpec, @property def _state_file_path(self) -> str: + """Absolute path to the JSON state file used for resume tracking.""" return os.path.join(self._work_dir, STATE_FILE_NAME) def _save_state(self): + """Persist current task results to the state file so runs can be resumed later.""" state = { 'tasks': { name: {'exit_code': result.exit_code, 'output_dir': result.output_dir} @@ -173,12 +183,14 @@ def _save_state(self): json.dump(state, f, indent=2) def 
_load_state(self) -> Dict | None: + """Load previously saved task state from disk, returning None if no state file exists.""" if not os.path.exists(self._state_file_path): return None with open(self._state_file_path) as f: return json.load(f) def _restore_completed_tasks(self, from_step: str | None = None): + """Reload completed tasks from a previous run, optionally invalidating from a given step onward.""" state = self._load_state() if state is None: logger.info('No previous state found — starting from scratch') @@ -205,6 +217,7 @@ def _restore_completed_tasks(self, from_step: str | None = None): logger.info('Resuming: skipping completed task "%s"', name) def _get_downstream_tasks(self, task_name: str) -> Set[str]: + """Return all transitive downstream dependents of the given task via BFS.""" visited: Set[str] = set() queue = [task_name] while queue: @@ -216,11 +229,13 @@ def _get_downstream_tasks(self, task_name: str) -> Set[str]: return visited def _groups(self, spec: workflow_module.WorkflowSpec) -> List[task_module.TaskGroupSpec]: + """Return the spec's groups, or synthesize one group per task when groups are absent.""" if spec.groups: return spec.groups return [task_module.TaskGroupSpec(name=t.name, tasks=[t]) for t in spec.tasks] def _build_dag(self, spec: workflow_module.WorkflowSpec): + """Construct the internal DAG of TaskNodes from the workflow spec's tasks and input dependencies.""" self._task_nodes.clear() task_to_group: Dict[str, str] = {} @@ -245,6 +260,7 @@ def _build_dag(self, spec: workflow_module.WorkflowSpec): self._task_nodes[upstream_task].downstream.add(task_spec.name) def _validate_for_local(self, spec: workflow_module.WorkflowSpec): + """Raise ValueError if the spec uses features unsupported in local mode (datasets, URLs, credentials, etc.).""" unsupported_features = [] for group in self._groups(spec): for task_spec in group.tasks: @@ -279,11 +295,13 @@ def _validate_for_local(self, spec: workflow_module.WorkflowSpec): + '\n - 
'.join(unsupported_features)) def _setup_directories(self): + """Create the work directory and per-task output directories on the host filesystem.""" os.makedirs(self._work_dir, exist_ok=True) for task_name in self._task_nodes: os.makedirs(os.path.join(self._work_dir, task_name, 'output'), exist_ok=True) def _find_ready_tasks(self) -> List[str]: + """Return tasks whose upstream dependencies have all completed successfully.""" completed = set(self._results.keys()) ready = [] for name, node in self._task_nodes.items(): @@ -296,6 +314,7 @@ def _find_ready_tasks(self) -> List[str]: return ready def _cancel_downstream(self, failed_task: str): + """Mark all transitive downstream tasks of a failed task as cancelled (exit_code -1).""" visited: Set[str] = set() queue = [failed_task] while queue: @@ -309,12 +328,14 @@ def _cancel_downstream(self, failed_task: str): def _task_gpu_count(self, task_spec: task_module.TaskSpec, spec: workflow_module.WorkflowSpec) -> int: + """Return the number of GPUs requested by a task's resource spec, defaulting to 0.""" resource_spec = spec.resources.get(task_spec.resource) if resource_spec and resource_spec.gpu: return resource_spec.gpu return 0 def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskResult: + """Execute a single task as a Docker container, mounting inputs/outputs/files and returning the result.""" task_spec = node.spec task_dir = os.path.join(self._work_dir, node.name) output_dir = os.path.join(task_dir, 'output') @@ -394,6 +415,7 @@ def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskR return TaskResult(name=node.name, exit_code=127, output_dir=output_dir) def _build_token_map(self, node: TaskNode, output_dir: str) -> Dict[str, str]: + """Build a mapping of {{token}} keys to host paths for output and each upstream input.""" tokens: Dict[str, str] = { 'output': output_dir, } @@ -405,6 +427,7 @@ def _build_token_map(self, node: TaskNode, output_dir: str) -> Dict[str, str]: 
return tokens def _substitute_tokens(self, text: str, tokens: Dict[str, str]) -> str: + """Replace all {{key}} placeholders in text with their corresponding token values.""" for key, value in tokens.items(): text = re.sub(r'\{\{\s*' + re.escape(key) + r'\s*\}\}', value, text) return text @@ -415,6 +438,7 @@ def run_workflow_locally(spec_path: str, work_dir: str | None = None, resume: bool = False, from_step: str | None = None, docker_cmd: str = 'docker') -> bool: + """Load a workflow spec from disk and execute it locally via Docker, managing the work directory lifecycle.""" if (resume or from_step) and work_dir is None: raise ValueError( '--resume and --from-step require --work-dir pointing to a previous run directory.') diff --git a/src/utils/tests/test_local_executor.py b/src/utils/tests/test_local_executor.py index 0da9ba993..673d1f2af 100644 --- a/src/utils/tests/test_local_executor.py +++ b/src/utils/tests/test_local_executor.py @@ -32,6 +32,7 @@ # Helper: detect Docker availability once for the entire module # --------------------------------------------------------------------------- def _docker_available() -> bool: + """Return True if the Docker daemon is reachable via 'docker info', False otherwise.""" try: result = subprocess.run( ['docker', 'info'], @@ -54,6 +55,7 @@ class TestLoadSpec(unittest.TestCase): """Verify that real OSMO YAML specs are parsed correctly via the existing Pydantic models.""" def test_single_task_spec(self): + """Parse a minimal single-task workflow and verify name, task count, and image.""" spec_text = textwrap.dedent('''\ workflow: name: hello-osmo @@ -71,6 +73,7 @@ def test_single_task_spec(self): self.assertEqual(spec.tasks[0].image, 'ubuntu:24.04') def test_serial_tasks_spec(self): + """Parse a two-task serial workflow and verify the task input dependency is resolved.""" spec_text = textwrap.dedent('''\ workflow: name: serial-tasks @@ -105,6 +108,7 @@ def test_serial_tasks_spec(self): self.assertEqual(first_input.task, 'task1') 
def test_groups_spec(self): + """Parse a grouped workflow and verify group structure and the lead task flag.""" spec_text = textwrap.dedent('''\ workflow: name: grouped @@ -126,6 +130,7 @@ def test_groups_spec(self): self.assertTrue(spec.groups[0].tasks[0].lead) def test_versioned_spec(self): + """Parse a spec with an explicit version field and verify it loads correctly.""" spec_text = textwrap.dedent('''\ version: 2 workflow: @@ -140,6 +145,7 @@ def test_versioned_spec(self): self.assertEqual(spec.name, 'versioned') def test_invalid_version_rejected(self): + """Reject a spec with an unsupported version number.""" spec_text = textwrap.dedent('''\ version: 99 workflow: @@ -154,6 +160,7 @@ def test_invalid_version_rejected(self): executor.load_spec(spec_text) def test_both_tasks_and_groups_rejected(self): + """Reject a spec that defines both top-level tasks and groups simultaneously.""" spec_text = textwrap.dedent('''\ workflow: name: invalid @@ -173,6 +180,7 @@ def test_both_tasks_and_groups_rejected(self): executor.load_spec(spec_text) def test_empty_workflow_rejected(self): + """Reject a spec with no tasks or groups defined.""" spec_text = textwrap.dedent('''\ workflow: name: empty @@ -182,6 +190,7 @@ def test_empty_workflow_rejected(self): executor.load_spec(spec_text) def test_resources_spec_parsed(self): + """Parse a spec with resource definitions and verify cpu/memory values.""" spec_text = textwrap.dedent('''\ workflow: name: with-resources @@ -201,6 +210,7 @@ def test_resources_spec_parsed(self): self.assertEqual(spec.resources['default'].memory, '4Gi') def test_environment_parsed(self): + """Parse a spec with environment variables and verify key-value pairs are preserved.""" spec_text = textwrap.dedent('''\ workflow: name: env-test @@ -222,9 +232,11 @@ class TestBuildDag(unittest.TestCase): """Verify DAG construction from task dependencies.""" def _make_executor(self) -> LocalExecutor: + """Create a LocalExecutor with a throwaway work directory for DAG-only 
tests.""" return LocalExecutor(work_dir='/tmp/unused') def test_no_dependencies(self): + """All tasks with no input dependencies have empty upstream and downstream sets.""" spec_text = textwrap.dedent('''\ workflow: name: parallel @@ -249,6 +261,7 @@ def test_no_dependencies(self): self.assertEqual(len(node.downstream), 0) def test_serial_chain(self): + """A three-task chain produces correct upstream/downstream links at each step.""" spec_text = textwrap.dedent('''\ workflow: name: serial @@ -279,6 +292,7 @@ def test_serial_chain(self): self.assertEqual(executor._task_nodes['third'].downstream, set()) def test_diamond_dependency(self): + """A diamond DAG (root -> left/right -> join) wires fan-out and fan-in edges correctly.""" spec_text = textwrap.dedent('''\ workflow: name: diamond @@ -311,6 +325,7 @@ def test_diamond_dependency(self): self.assertEqual(executor._task_nodes['join'].upstream, {'left', 'right'}) def test_unknown_dependency_raises(self): + """Referencing a non-existent upstream task raises ValueError.""" spec_text = textwrap.dedent('''\ workflow: name: broken @@ -328,6 +343,7 @@ def test_unknown_dependency_raises(self): self.assertIn('nonexistent', str(context.exception)) def test_groups_with_cross_group_deps(self): + """Dependencies between tasks in different groups are wired correctly.""" spec_text = textwrap.dedent('''\ workflow: name: cross-group @@ -359,6 +375,7 @@ class TestFindReadyTasks(unittest.TestCase): """Verify correct identification of tasks ready to execute.""" def test_all_root_tasks_ready(self): + """Tasks with no upstream dependencies are immediately ready.""" spec_text = textwrap.dedent('''\ workflow: name: parallel @@ -378,6 +395,7 @@ def test_all_root_tasks_ready(self): self.assertEqual(set(ready), {'a', 'b'}) def test_dependent_not_ready_until_upstream_completes(self): + """A downstream task only becomes ready after its upstream dependency completes.""" spec_text = textwrap.dedent('''\ workflow: name: serial @@ -403,6 +421,7 @@ 
def test_dependent_not_ready_until_upstream_completes(self): self.assertEqual(ready, ['second']) def test_failed_upstream_blocks_downstream(self): + """A failed upstream task prevents its downstream dependents from becoming ready.""" spec_text = textwrap.dedent('''\ workflow: name: serial @@ -426,8 +445,10 @@ def test_failed_upstream_blocks_downstream(self): class TestCancelDownstream(unittest.TestCase): + """Verify that downstream tasks are cancelled when an upstream task fails.""" def test_cascading_cancel(self): + """Cancellation of a failed task propagates to all transitive downstream dependents.""" spec_text = textwrap.dedent('''\ workflow: name: chain @@ -460,46 +481,55 @@ def test_cascading_cancel(self): class TestSubstituteTokens(unittest.TestCase): + """Verify {{token}} placeholder replacement in command strings and file contents.""" def test_output_token(self): + """The {{output}} token is replaced with the task output directory path.""" executor = LocalExecutor(work_dir='/tmp/unused') tokens = {'output': '/work/task1/output'} result = executor._substitute_tokens('echo data > {{output}}/file.txt', tokens) self.assertEqual(result, 'echo data > /work/task1/output/file.txt') def test_input_by_index(self): + """The {{input:N}} token is replaced with the Nth upstream output directory.""" executor = LocalExecutor(work_dir='/tmp/unused') tokens = {'input:0': '/work/upstream/output'} result = executor._substitute_tokens('cat {{input:0}}/data.csv', tokens) self.assertEqual(result, 'cat /work/upstream/output/data.csv') def test_input_by_name(self): + """The {{input:taskname}} token is replaced with the named task's output directory.""" executor = LocalExecutor(work_dir='/tmp/unused') tokens = {'input:task1': '/work/task1/output'} result = executor._substitute_tokens('cat {{ input:task1 }}/data.csv', tokens) self.assertEqual(result, 'cat /work/task1/output/data.csv') def test_whitespace_around_tokens(self): + """Whitespace inside {{ token }} braces is tolerated 
during substitution.""" executor = LocalExecutor(work_dir='/tmp/unused') tokens = {'output': '/out'} result = executor._substitute_tokens('{{ output }}/file.txt', tokens) self.assertEqual(result, '/out/file.txt') def test_multiple_tokens_in_one_string(self): + """Multiple distinct tokens in the same string are all replaced.""" executor = LocalExecutor(work_dir='/tmp/unused') tokens = {'output': '/out', 'input:0': '/in0'} result = executor._substitute_tokens('cp {{input:0}}/src {{output}}/dst', tokens) self.assertEqual(result, 'cp /in0/src /out/dst') def test_no_tokens_unchanged(self): + """Text without any token placeholders passes through unchanged.""" executor = LocalExecutor(work_dir='/tmp/unused') result = executor._substitute_tokens('plain text no tokens', {}) self.assertEqual(result, 'plain text no tokens') class TestBuildTokenMap(unittest.TestCase): + """Verify that token maps are built correctly from task DAG relationships.""" def test_output_only(self): + """A task with no inputs produces a token map containing only the output key.""" spec_text = textwrap.dedent('''\ workflow: name: simple @@ -518,6 +548,7 @@ def test_output_only(self): self.assertEqual(len(tokens), 1) def test_with_upstream_inputs(self): + """A task with upstream inputs gets both index-based and name-based input tokens.""" spec_text = textwrap.dedent('''\ workflow: name: serial @@ -550,9 +581,11 @@ class TestValidateForLocal(unittest.TestCase): """Verify that unsupported features are detected and rejected.""" def _make_executor(self) -> LocalExecutor: + """Create a LocalExecutor with a throwaway work directory for validation-only tests.""" return LocalExecutor(work_dir='/tmp/unused') def test_simple_spec_passes(self): + """A spec using only task-to-task inputs passes local validation.""" spec_text = textwrap.dedent('''\ workflow: name: ok @@ -567,6 +600,7 @@ def test_simple_spec_passes(self): executor._validate_for_local(spec) def test_dataset_input_rejected(self): + """A spec with 
dataset inputs is rejected as unsupported in local mode.""" spec_text = textwrap.dedent('''\ workflow: name: bad @@ -586,6 +620,7 @@ def test_dataset_input_rejected(self): self.assertIn('dataset', str(context.exception)) def test_url_input_rejected(self): + """A spec with URL inputs is rejected as unsupported in local mode.""" spec_text = textwrap.dedent('''\ workflow: name: bad @@ -604,6 +639,7 @@ def test_url_input_rejected(self): self.assertIn('URL', str(context.exception)) def test_dataset_output_rejected(self): + """A spec with dataset outputs is rejected as unsupported in local mode.""" spec_text = textwrap.dedent('''\ workflow: name: bad @@ -623,6 +659,7 @@ def test_dataset_output_rejected(self): self.assertIn('dataset', str(context.exception).lower()) def test_url_output_rejected(self): + """A spec with URL outputs is rejected as unsupported in local mode.""" spec_text = textwrap.dedent('''\ workflow: name: bad @@ -641,6 +678,7 @@ def test_url_output_rejected(self): self.assertIn('object storage', str(context.exception).lower()) def test_multiple_unsupported_features_all_reported(self): + """All unsupported features across multiple tasks are reported in a single error.""" spec_text = textwrap.dedent('''\ workflow: name: bad @@ -667,6 +705,7 @@ def test_multiple_unsupported_features_all_reported(self): self.assertIn('task2', error_message) def test_task_deps_only_passes(self): + """A spec with only task-to-task dependencies passes local validation.""" spec_text = textwrap.dedent('''\ workflow: name: ok @@ -686,6 +725,7 @@ def test_task_deps_only_passes(self): executor._validate_for_local(spec) def test_files_and_env_pass(self): + """A spec using files and environment variables passes local validation.""" spec_text = textwrap.dedent('''\ workflow: name: ok @@ -706,8 +746,10 @@ def test_files_and_env_pass(self): class TestJinjaTemplateDetection(unittest.TestCase): + """Verify that specs containing Jinja template markers are rejected before execution.""" def 
_write_temp_spec(self, content: str) -> str: + """Write YAML content to a temporary file and return its path.""" f = tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) f.write(content) f.flush() @@ -715,6 +757,7 @@ def _write_temp_spec(self, content: str) -> str: return f.name def test_jinja_block_detected(self): + """A spec containing {%% %%} Jinja block tags is rejected.""" path = self._write_temp_spec(textwrap.dedent('''\ workflow: name: {%% if true %%}test{%% endif %%} @@ -731,6 +774,7 @@ def test_jinja_block_detected(self): os.unlink(path) def test_jinja_comment_detected(self): + """A spec containing {# #} Jinja comment tags is rejected.""" path = self._write_temp_spec(textwrap.dedent('''\ {# A comment #} workflow: @@ -748,6 +792,7 @@ def test_jinja_comment_detected(self): os.unlink(path) def test_default_values_section_detected(self): + """A spec containing a 'default-values' section is rejected as a Jinja template.""" path = self._write_temp_spec(textwrap.dedent('''\ workflow: name: "{{experiment_name}}" @@ -777,12 +822,15 @@ class TestDockerExecution(unittest.TestCase): """ def setUp(self): + """Create a temporary work directory for each Docker execution test.""" self.work_dir = tempfile.mkdtemp(prefix='osmo-local-test-') def tearDown(self): + """Remove the temporary work directory after each test.""" shutil.rmtree(self.work_dir, ignore_errors=True) def _execute_spec(self, spec_text: str) -> bool: + """Parse and execute a workflow spec string, returning the success status.""" executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) spec = executor.load_spec(spec_text) return executor.execute(spec) @@ -790,6 +838,7 @@ def _execute_spec(self, spec_text: str) -> bool: # ---- Single task tests ---- def test_hello_world(self): + """Run a minimal single-task workflow that echoes a message.""" spec_text = textwrap.dedent('''\ workflow: name: hello-osmo @@ -801,6 +850,7 @@ def test_hello_world(self): 
self.assertTrue(self._execute_spec(spec_text)) def test_single_task_with_args(self): + """Run a task with separate command and args fields.""" spec_text = textwrap.dedent('''\ workflow: name: args-test @@ -813,6 +863,7 @@ def test_single_task_with_args(self): self.assertTrue(self._execute_spec(spec_text)) def test_task_failure_returns_false(self): + """A task that exits with a non-zero code causes execute() to return False.""" spec_text = textwrap.dedent('''\ workflow: name: will-fail @@ -826,6 +877,7 @@ def test_task_failure_returns_false(self): # ---- Environment variable tests ---- def test_environment_variables(self): + """Environment variables declared in the spec are passed to the Docker container.""" spec_text = textwrap.dedent('''\ workflow: name: env-test @@ -843,6 +895,7 @@ def test_environment_variables(self): # ---- Files mount tests ---- def test_inline_file_mounted(self): + """An inline file declared in the spec is mounted and executable inside the container.""" spec_text = textwrap.dedent('''\ workflow: name: files-test @@ -858,6 +911,7 @@ def test_inline_file_mounted(self): self.assertTrue(self._execute_spec(spec_text)) def test_multiple_files_mounted(self): + """Multiple inline files at different paths are all mounted into the container.""" spec_text = textwrap.dedent('''\ workflow: name: multi-files @@ -878,6 +932,7 @@ def test_multiple_files_mounted(self): # ---- Data output tests ---- def test_output_directory_writable(self): + """The {{output}} directory is writable from inside the container and persists on the host.""" spec_text = textwrap.dedent('''\ workflow: name: output-test @@ -896,6 +951,7 @@ def test_output_directory_writable(self): # ---- Serial data flow tests ---- def test_serial_data_flow_two_tasks(self): + """Data written to {{output}} by a producer is readable via {{input:0}} by the consumer.""" spec_text = textwrap.dedent('''\ workflow: name: serial-data @@ -959,7 +1015,7 @@ def test_serial_chain_three_tasks(self): # ---- 
Parallel execution tests ---- def test_parallel_independent_tasks(self): - """Mimics cookbook/tutorials/parallel_tasks.yaml""" + """Independent tasks with no dependencies all execute and produce their respective outputs.""" spec_text = textwrap.dedent('''\ workflow: name: parallel-tasks @@ -986,6 +1042,7 @@ def test_parallel_independent_tasks(self): # ---- Diamond DAG tests ---- def test_diamond_dag(self): + """A diamond-shaped DAG executes correctly with fan-out and fan-in data flow.""" spec_text = textwrap.dedent('''\ workflow: name: diamond @@ -1025,6 +1082,7 @@ def test_diamond_dag(self): # ---- Failure propagation tests ---- def test_failure_cancels_downstream(self): + """A failed task prevents its downstream dependent from running.""" spec_text = textwrap.dedent('''\ workflow: name: fail-chain @@ -1043,6 +1101,7 @@ def test_failure_cancels_downstream(self): self.assertFalse(os.path.exists(output_file)) def test_parallel_failure_does_not_affect_independent_branch(self): + """When one branch of a parallel DAG fails, the executor stops with overall failure.""" spec_text = textwrap.dedent('''\ workflow: name: partial-fail @@ -1071,6 +1130,7 @@ def test_parallel_failure_does_not_affect_independent_branch(self): # ---- Groups (ganged tasks) tests ---- def test_group_with_single_task(self): + """A group containing a single lead task executes and produces output.""" spec_text = textwrap.dedent('''\ workflow: name: single-group @@ -1125,6 +1185,7 @@ def test_groups_with_data_flow(self): # ---- Input by task name tests ---- def test_input_by_task_name(self): + """The {{input:taskname}} token resolves to the named upstream task's output directory.""" spec_text = textwrap.dedent('''\ workflow: name: named-input @@ -1198,6 +1259,7 @@ def test_resources_ignored_gracefully(self): # ---- Docker-not-found handling ---- def test_docker_not_found_graceful_failure(self): + """Using a non-existent docker binary results in a graceful failure rather than a crash.""" spec_text = 
textwrap.dedent('''\ workflow: name: no-docker @@ -1217,6 +1279,7 @@ def test_docker_not_found_graceful_failure(self): # ---- Alternative container runtime ---- def test_custom_docker_command(self): + """An explicitly specified docker command is used to run the container.""" spec_text = textwrap.dedent('''\ workflow: name: custom-cmd @@ -1248,12 +1311,15 @@ class TestCookbookSpecs(unittest.TestCase): 'cookbook', 'tutorials') def setUp(self): + """Create a temporary work directory for cookbook spec tests.""" self.work_dir = tempfile.mkdtemp(prefix='osmo-local-cookbook-') def tearDown(self): + """Remove the temporary work directory after each cookbook test.""" shutil.rmtree(self.work_dir, ignore_errors=True) def _run_cookbook_spec(self, filename: str) -> bool: + """Execute a cookbook tutorial spec file through the local executor.""" spec_path = os.path.join(self.COOKBOOK_DIR, filename) if not os.path.exists(spec_path): self.skipTest(f'Cookbook file not found: {spec_path}') @@ -1264,15 +1330,19 @@ def _run_cookbook_spec(self, filename: str) -> bool: ) def test_hello_world_yaml(self): + """Execute the hello_world.yaml cookbook tutorial spec.""" self.assertTrue(self._run_cookbook_spec('hello_world.yaml')) def test_parallel_tasks_yaml(self): + """Execute the parallel_tasks.yaml cookbook tutorial spec.""" self.assertTrue(self._run_cookbook_spec('parallel_tasks.yaml')) def test_serial_workflow_yaml(self): + """Execute the serial_workflow.yaml cookbook tutorial spec.""" self.assertTrue(self._run_cookbook_spec('serial_workflow.yaml')) def test_resources_basic_yaml(self): + """Execute the resources_basic.yaml cookbook tutorial spec.""" self.assertTrue(self._run_cookbook_spec('resources_basic.yaml')) def test_combination_workflow_simple_yaml(self): @@ -1323,12 +1393,15 @@ class TestRunWorkflowLocally(unittest.TestCase): """Test the top-level run_workflow_locally() convenience function.""" def setUp(self): + """Create a temporary work directory for run_workflow_locally 
tests.""" self.work_dir = tempfile.mkdtemp(prefix='osmo-local-func-') def tearDown(self): + """Remove the temporary work directory after each test.""" shutil.rmtree(self.work_dir, ignore_errors=True) def test_success_cleans_up_when_not_keeping(self): + """On success with keep_work_dir=False, the work directory is removed.""" work_dir = tempfile.mkdtemp(prefix='osmo-local-cleanup-') with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: f.write(textwrap.dedent('''\ @@ -1354,6 +1427,7 @@ def test_success_cleans_up_when_not_keeping(self): shutil.rmtree(work_dir, ignore_errors=True) def test_failure_preserves_work_dir(self): + """On failure, the work directory is preserved for debugging regardless of the keep flag.""" with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: f.write(textwrap.dedent('''\ workflow: @@ -1376,6 +1450,7 @@ def test_failure_preserves_work_dir(self): os.unlink(spec_path) def test_keep_flag_preserves_on_success(self): + """With keep_work_dir=True, the work directory is preserved even on success.""" with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: f.write(textwrap.dedent('''\ workflow: @@ -1398,6 +1473,7 @@ def test_keep_flag_preserves_on_success(self): os.unlink(spec_path) def test_nonexistent_file_raises(self): + """Passing a non-existent spec file path raises FileNotFoundError.""" with self.assertRaises(FileNotFoundError): run_workflow_locally(spec_path='/nonexistent/path/spec.yaml') From 6297dc9393181f60bf46312e27064ec37ee57239 Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Fri, 3 Apr 2026 17:26:55 -0700 Subject: [PATCH 09/35] Refactor file handling in LocalExecutor for UTF-8 encoding - Updated file operations in `local_executor.py` to explicitly use UTF-8 encoding when reading and writing files, ensuring better compatibility with various text formats. 
- Adjusted exception handling in `test_local_executor.py` to raise `ValueError` instead of a generic `Exception` for clearer error reporting. - Modified test documentation to reflect the correct Jinja block syntax in error messages. --- src/utils/local_executor.py | 10 +++++----- src/utils/tests/test_local_executor.py | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/utils/local_executor.py b/src/utils/local_executor.py index 4a84c3ec8..3f16e4e03 100644 --- a/src/utils/local_executor.py +++ b/src/utils/local_executor.py @@ -179,14 +179,14 @@ def _save_state(self): if result.exit_code != -1 } } - with open(self._state_file_path, 'w') as f: + with open(self._state_file_path, 'w', encoding='utf-8') as f: json.dump(state, f, indent=2) def _load_state(self) -> Dict | None: """Load previously saved task state from disk, returning None if no state file exists.""" if not os.path.exists(self._state_file_path): return None - with open(self._state_file_path) as f: + with open(self._state_file_path, encoding='utf-8') as f: return json.load(f) def _restore_completed_tasks(self, from_step: str | None = None): @@ -348,7 +348,7 @@ def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskR resolved_contents = self._substitute_tokens(file_spec.contents, token_map) host_path = os.path.join(files_dir, file_spec.path.lstrip('/')) os.makedirs(os.path.dirname(host_path), exist_ok=True) - with open(host_path, 'w') as f: + with open(host_path, 'w', encoding='utf-8') as f: f.write(resolved_contents) resolved_command = [self._substitute_tokens(c, token_map) for c in task_spec.command] @@ -447,10 +447,10 @@ def run_workflow_locally(spec_path: str, work_dir: str | None = None, work_dir = tempfile.mkdtemp(prefix='osmo-local-') logger.info('Using temporary work directory: %s', work_dir) - with open(spec_path) as f: + with open(spec_path, encoding='utf-8') as f: spec_text = f.read() - template_markers = ('{%%', '{#', 'default-values') + 
template_markers = ('{%', '{#', 'default-values') if any(marker in spec_text for marker in template_markers): raise ValueError( 'This spec uses Jinja templates which require server-side expansion.\n' diff --git a/src/utils/tests/test_local_executor.py b/src/utils/tests/test_local_executor.py index 673d1f2af..5761c6822 100644 --- a/src/utils/tests/test_local_executor.py +++ b/src/utils/tests/test_local_executor.py @@ -156,7 +156,7 @@ def test_invalid_version_rejected(self): command: ["echo", "ok"] ''') executor = LocalExecutor(work_dir='/tmp/unused') - with self.assertRaises(Exception): + with self.assertRaises(ValueError): executor.load_spec(spec_text) def test_both_tasks_and_groups_rejected(self): @@ -176,7 +176,7 @@ def test_both_tasks_and_groups_rejected(self): command: ["echo"] ''') executor = LocalExecutor(work_dir='/tmp/unused') - with self.assertRaises(Exception): + with self.assertRaises(ValueError): executor.load_spec(spec_text) def test_empty_workflow_rejected(self): @@ -186,7 +186,7 @@ def test_empty_workflow_rejected(self): name: empty ''') executor = LocalExecutor(work_dir='/tmp/unused') - with self.assertRaises(Exception): + with self.assertRaises(ValueError): executor.load_spec(spec_text) def test_resources_spec_parsed(self): @@ -757,10 +757,10 @@ def _write_temp_spec(self, content: str) -> str: return f.name def test_jinja_block_detected(self): - """A spec containing {%% %%} Jinja block tags is rejected.""" + """A spec containing {% %} Jinja block tags is rejected.""" path = self._write_temp_spec(textwrap.dedent('''\ workflow: - name: {%% if true %%}test{%% endif %%} + name: {% if true %}test{% endif %} tasks: - name: task image: alpine:3.18 From 0dffb7976f522e74d7dd70f1684b5a3cf9975584 Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Fri, 3 Apr 2026 18:23:54 -0700 Subject: [PATCH 10/35] Enhance error handling and update documentation in local execution modules - Expanded exception handling in `local.py` to include `FileNotFoundError` and 
`PermissionError` for improved robustness. - Updated comments in `local_executor.py` to clarify unsupported features in local mode, specifically regarding privileged containers and host networking. - Modified test case in `test_local_executor.py` to ensure caller-supplied work directories are preserved on success, enhancing test accuracy. --- src/cli/local.py | 4 ++-- src/utils/local_executor.py | 15 ++++++++++++--- src/utils/tests/test_local_executor.py | 6 +++--- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/cli/local.py b/src/cli/local.py index 5bf596879..d446e26cf 100644 --- a/src/cli/local.py +++ b/src/cli/local.py @@ -1,5 +1,5 @@ """ -SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # pylint: disable=line-too-long Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -82,7 +82,7 @@ def _run_local(service_client, args: argparse.Namespace): from_step=args.from_step, docker_cmd=args.docker_cmd, ) - except ValueError as error: + except (ValueError, FileNotFoundError, PermissionError) as error: print(f'Error: {error}', file=sys.stderr) sys.exit(1) diff --git a/src/utils/local_executor.py b/src/utils/local_executor.py index 3f16e4e03..9f884f20c 100644 --- a/src/utils/local_executor.py +++ b/src/utils/local_executor.py @@ -1,5 +1,5 @@ """ -SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # pylint: disable=line-too-long Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -289,6 +289,14 @@ def _validate_for_local(self, spec: workflow_module.WorkflowSpec): unsupported_features.append( f'Task "{task_spec.name}": volumeMounts require cluster-level host paths') + if task_spec.privileged: + unsupported_features.append( + f'Task "{task_spec.name}": privileged containers are not supported in local mode') + + if task_spec.hostNetwork: + unsupported_features.append( + f'Task "{task_spec.name}": hostNetwork is not supported in local mode') + if unsupported_features: raise ValueError( 'The following features are not supported in local execution mode:\n - ' @@ -443,7 +451,8 @@ def run_workflow_locally(spec_path: str, work_dir: str | None = None, raise ValueError( '--resume and --from-step require --work-dir pointing to a previous run directory.') - if work_dir is None: + created_work_dir = work_dir is None + if created_work_dir: work_dir = tempfile.mkdtemp(prefix='osmo-local-') logger.info('Using temporary work directory: %s', work_dir) @@ -463,7 +472,7 @@ def run_workflow_locally(spec_path: str, work_dir: str | None = None, success = executor.execute(spec, resume=resume or from_step is not None, from_step=from_step) - if not keep_work_dir and success: + if created_work_dir and not keep_work_dir and success: logger.info('Cleaning up work directory: %s', work_dir) shutil.rmtree(work_dir, ignore_errors=True) elif not success: diff --git a/src/utils/tests/test_local_executor.py b/src/utils/tests/test_local_executor.py index 5761c6822..917d39172 100644 --- a/src/utils/tests/test_local_executor.py +++ b/src/utils/tests/test_local_executor.py @@ -1400,8 +1400,8 @@ def tearDown(self): """Remove the temporary work directory after each test.""" shutil.rmtree(self.work_dir, ignore_errors=True) - def test_success_cleans_up_when_not_keeping(self): - """On success with keep_work_dir=False, the work directory is removed.""" + def test_caller_supplied_work_dir_preserved_on_success(self): + """A caller-supplied work_dir is never deleted, even with 
keep_work_dir=False.""" work_dir = tempfile.mkdtemp(prefix='osmo-local-cleanup-') with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: f.write(textwrap.dedent('''\ @@ -1420,7 +1420,7 @@ def test_success_cleans_up_when_not_keeping(self): keep_work_dir=False, ) self.assertTrue(result) - self.assertFalse(os.path.exists(work_dir)) + self.assertTrue(os.path.exists(work_dir)) finally: os.unlink(spec_path) if os.path.exists(work_dir): From 0a7e15cb3deb4bc4b329098549e5311c0d58413b Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Fri, 3 Apr 2026 18:39:45 -0700 Subject: [PATCH 11/35] Update copyright line in test_local_executor.py to comply with pylint standards --- src/utils/tests/test_local_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils/tests/test_local_executor.py b/src/utils/tests/test_local_executor.py index 917d39172..872aa11ac 100644 --- a/src/utils/tests/test_local_executor.py +++ b/src/utils/tests/test_local_executor.py @@ -1,5 +1,5 @@ """ -SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # pylint: disable=line-too-long Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. From 84a26daf86dbfd5c7d4de898b2bb8ca6b98ed1ff Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Fri, 3 Apr 2026 18:53:44 -0700 Subject: [PATCH 12/35] Add shared memory size support for GPU tasks in local execution - Introduced `--shm-size` argument in the CLI for specifying shared memory size for GPU containers, defaulting to 16g. - Updated `LocalExecutor` to accept and utilize the shared memory size during Docker command construction. 
- Added unit tests to verify correct handling of shared memory size for both default and custom values in GPU tasks, ensuring no shared memory argument is included for non-GPU tasks. --- src/cli/local.py | 8 +++ src/utils/local_executor.py | 14 +++- src/utils/tests/test_local_executor.py | 90 ++++++++++++++++++++++++++ 3 files changed, 109 insertions(+), 3 deletions(-) diff --git a/src/cli/local.py b/src/cli/local.py index d446e26cf..67eef4ca4 100644 --- a/src/cli/local.py +++ b/src/cli/local.py @@ -68,6 +68,13 @@ def setup_parser(parser: argparse._SubParsersAction): help='Resume from a specific task, re-running it and all downstream tasks. ' 'Tasks upstream of the specified step are skipped if they completed ' 'successfully. Requires --work-dir pointing to the previous run directory.') + run_parser.add_argument( + '--shm-size', + dest='shm_size', + default=None, + help='Shared memory size for GPU containers (e.g. 16g, 32g). ' + 'Defaults to 16g for tasks that request GPUs. ' + 'PyTorch DataLoader workers require large shared memory.') run_parser.set_defaults(func=_run_local) @@ -81,6 +88,7 @@ def _run_local(service_client, args: argparse.Namespace): resume=args.resume, from_step=args.from_step, docker_cmd=args.docker_cmd, + shm_size=args.shm_size, ) except (ValueError, FileNotFoundError, PermissionError) as error: print(f'Error: {error}', file=sys.stderr) diff --git a/src/utils/local_executor.py b/src/utils/local_executor.py index 9f884f20c..d5c8351d1 100644 --- a/src/utils/local_executor.py +++ b/src/utils/local_executor.py @@ -75,11 +75,15 @@ class LocalExecutor: - Templated specs with Jinja (require server-side expansion; use --dry-run first) """ - def __init__(self, work_dir: str, keep_work_dir: bool = False, docker_cmd: str = 'docker'): + DEFAULT_SHM_SIZE = '16g' + + def __init__(self, work_dir: str, keep_work_dir: bool = False, docker_cmd: str = 'docker', + shm_size: str | None = None): """Initialize the executor with a work directory, cleanup preference, 
and container runtime command.""" self._work_dir = work_dir self._keep_work_dir = keep_work_dir self._docker_cmd = docker_cmd + self._shm_size = shm_size self._task_nodes: Dict[str, TaskNode] = {} self._results: Dict[str, TaskResult] = {} self._available_gpus: int | None = None @@ -380,6 +384,9 @@ def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskR docker_args += ['--gpus', f'"device={",".join(str(i) for i in range(gpu_count))}"'] logger.info('Task "%s" requesting %d GPU(s), using %d', node.name, gpu_count, min(gpu_count, available)) + shm_size = self._shm_size or self.DEFAULT_SHM_SIZE + docker_args += ['--shm-size', shm_size] + for key, value in task_spec.environment.items(): resolved_value = self._substitute_tokens(value, token_map) docker_args += ['-e', f'{key}={resolved_value}'] @@ -445,7 +452,8 @@ def run_workflow_locally(spec_path: str, work_dir: str | None = None, keep_work_dir: bool = False, resume: bool = False, from_step: str | None = None, - docker_cmd: str = 'docker') -> bool: + docker_cmd: str = 'docker', + shm_size: str | None = None) -> bool: """Load a workflow spec from disk and execute it locally via Docker, managing the work directory lifecycle.""" if (resume or from_step) and work_dir is None: raise ValueError( @@ -467,7 +475,7 @@ def run_workflow_locally(spec_path: str, work_dir: str | None = None, 'then save that output and run it locally.') executor = LocalExecutor(work_dir=work_dir, keep_work_dir=keep_work_dir, - docker_cmd=docker_cmd) + docker_cmd=docker_cmd, shm_size=shm_size) spec = executor.load_spec(spec_text) success = executor.execute(spec, resume=resume or from_step is not None, from_step=from_step) diff --git a/src/utils/tests/test_local_executor.py b/src/utils/tests/test_local_executor.py index 872aa11ac..82b49a98f 100644 --- a/src/utils/tests/test_local_executor.py +++ b/src/utils/tests/test_local_executor.py @@ -745,6 +745,96 @@ def test_files_and_env_pass(self): executor._validate_for_local(spec) 
+class TestShmSize(unittest.TestCase): + """Verify that --shm-size is passed to Docker for GPU tasks.""" + + def setUp(self): + """Create a temporary work directory for shm-size tests.""" + self.work_dir = tempfile.mkdtemp(prefix='osmo-local-shm-') + + def tearDown(self): + """Remove the temporary work directory after each test.""" + shutil.rmtree(self.work_dir, ignore_errors=True) + + @mock.patch('subprocess.run') + def test_gpu_task_gets_default_shm_size(self, mock_run): + """A GPU task includes --shm-size with the default value when none is specified.""" + mock_run.return_value = mock.Mock(returncode=0, stdout='0\n') + spec_text = textwrap.dedent('''\ + workflow: + name: shm-test + resources: + gpu-resource: + gpu: 1 + tasks: + - name: train + image: pytorch:latest + resource: gpu-resource + command: ["python", "train.py"] + ''') + executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + executor._setup_directories() + node = executor._task_nodes['train'] + executor._run_task(node, spec) + + docker_call_args = mock_run.call_args_list[-1][0][0] + self.assertIn('--shm-size', docker_call_args) + shm_index = docker_call_args.index('--shm-size') + self.assertEqual(docker_call_args[shm_index + 1], '16g') + + @mock.patch('subprocess.run') + def test_gpu_task_gets_custom_shm_size(self, mock_run): + """A GPU task uses the user-specified --shm-size value.""" + mock_run.return_value = mock.Mock(returncode=0, stdout='0\n') + spec_text = textwrap.dedent('''\ + workflow: + name: shm-test + resources: + gpu-resource: + gpu: 1 + tasks: + - name: train + image: pytorch:latest + resource: gpu-resource + command: ["python", "train.py"] + ''') + executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True, shm_size='32g') + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + executor._setup_directories() + node = executor._task_nodes['train'] + executor._run_task(node, spec) + + 
docker_call_args = mock_run.call_args_list[-1][0][0] + self.assertIn('--shm-size', docker_call_args) + shm_index = docker_call_args.index('--shm-size') + self.assertEqual(docker_call_args[shm_index + 1], '32g') + + @mock.patch('subprocess.run') + def test_non_gpu_task_has_no_shm_size(self, mock_run): + """A task without GPU resources does not include --shm-size in Docker args.""" + mock_run.return_value = mock.Mock(returncode=0) + spec_text = textwrap.dedent('''\ + workflow: + name: no-gpu + tasks: + - name: preprocess + image: alpine:3.18 + command: ["echo", "ok"] + ''') + executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + executor._setup_directories() + node = executor._task_nodes['preprocess'] + executor._run_task(node, spec) + + docker_call_args = mock_run.call_args[0][0] + self.assertNotIn('--shm-size', docker_call_args) + + class TestJinjaTemplateDetection(unittest.TestCase): """Verify that specs containing Jinja template markers are rejected before execution.""" From 05f4ea6a3e7c599e1e3d2ca91d6db288cab7c758 Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Fri, 3 Apr 2026 19:19:02 -0700 Subject: [PATCH 13/35] Add tutorial specs filegroup and enhance local executor tests - Created a new `tutorial_specs` filegroup in the `cookbook/tutorials/BUILD` to include YAML specifications. - Updated the `BUILD` file in `src/utils/tests` to include the new `tutorial_specs` as data for local tests. - Added a new test class in `test_local_executor.py` to validate unsupported features in cookbook specifications, ensuring proper error handling for unsupported fields. - Implemented additional tests to verify that specific unsupported features are correctly rejected during local execution. 
--- cookbook/tutorials/BUILD | 5 + src/utils/tests/BUILD | 3 + src/utils/tests/test_local_executor.py | 245 +++++++++++++++++++------ 3 files changed, 195 insertions(+), 58 deletions(-) create mode 100644 cookbook/tutorials/BUILD diff --git a/cookbook/tutorials/BUILD b/cookbook/tutorials/BUILD new file mode 100644 index 000000000..d56c526f4 --- /dev/null +++ b/cookbook/tutorials/BUILD @@ -0,0 +1,5 @@ +filegroup( + name = "tutorial_specs", + srcs = glob(["*.yaml"]), + visibility = ["//src/utils/tests:__pkg__"], +) diff --git a/src/utils/tests/BUILD b/src/utils/tests/BUILD index a9369af05..efe72682f 100644 --- a/src/utils/tests/BUILD +++ b/src/utils/tests/BUILD @@ -61,5 +61,8 @@ py_test( deps = [ "//src/utils:local_executor", ], + data = [ + "//cookbook/tutorials:tutorial_specs", + ], local = True, ) diff --git a/src/utils/tests/test_local_executor.py b/src/utils/tests/test_local_executor.py index 82b49a98f..25ccac7cb 100644 --- a/src/utils/tests/test_local_executor.py +++ b/src/utils/tests/test_local_executor.py @@ -745,6 +745,89 @@ def test_files_and_env_pass(self): executor._validate_for_local(spec) +class TestValidateForLocalRemainingBranches(unittest.TestCase): + """Verify that _validate_for_local rejects credentials, checkpoint, volumeMounts, privileged, and hostNetwork.""" + + _UNSUPPORTED_SPECS = { + 'credentials': { + 'yaml': textwrap.dedent('''\ + workflow: + name: bad + tasks: + - name: task + image: ubuntu:24.04 + command: ["echo"] + credentials: + my-secret: NGC_API_KEY + '''), + 'expected_substring': 'credentials', + }, + 'checkpoint': { + 'yaml': textwrap.dedent('''\ + workflow: + name: bad + tasks: + - name: task + image: ubuntu:24.04 + command: ["echo"] + checkpoint: + - path: /output/model + url: s3://bucket/checkpoints/ + frequency: 300 + '''), + 'expected_substring': 'checkpoint', + }, + 'volumeMounts': { + 'yaml': textwrap.dedent('''\ + workflow: + name: bad + tasks: + - name: task + image: ubuntu:24.04 + command: ["echo"] + volumeMounts: + - 
"/data:/data:ro" + '''), + 'expected_substring': 'volumeMounts', + }, + 'privileged': { + 'yaml': textwrap.dedent('''\ + workflow: + name: bad + tasks: + - name: task + image: ubuntu:24.04 + command: ["echo"] + privileged: true + '''), + 'expected_substring': 'privileged', + }, + 'hostNetwork': { + 'yaml': textwrap.dedent('''\ + workflow: + name: bad + tasks: + - name: task + image: ubuntu:24.04 + command: ["echo"] + hostNetwork: true + '''), + 'expected_substring': 'hostNetwork', + }, + } + + def test_unsupported_fields_rejected(self): + """Each unsupported task-level field is detected and rejected with a descriptive error.""" + for feature, case in self._UNSUPPORTED_SPECS.items(): + with self.subTest(feature=feature): + executor = LocalExecutor(work_dir='/tmp/unused') + spec = executor.load_spec(case['yaml']) + executor._build_dag(spec) + with self.assertRaises(ValueError) as context: + executor._validate_for_local(spec) + self.assertIn(case['expected_substring'], str(context.exception)) + + class TestShmSize(unittest.TestCase): """Verify that --shm-size is passed to Docker for GPU tasks.""" @@ -901,6 +984,108 @@ def test_default_values_section_detected(self): os.unlink(path) +# ============================================================================ +# Tests that exercise error paths without requiring Docker +# ============================================================================ +class TestDockerNotFoundHandling(unittest.TestCase): + """Verify graceful failure when Docker is not available (no Docker required to run).""" + + def setUp(self): + """Create a temporary work directory.""" + self.work_dir = tempfile.mkdtemp(prefix='osmo-local-test-') + + def tearDown(self): + """Remove the temporary work directory.""" + shutil.rmtree(self.work_dir, ignore_errors=True) + + def test_docker_not_found_graceful_failure(self): + """Using a non-existent docker binary results in a graceful failure rather than a crash.""" + spec_text = textwrap.dedent('''\ + 
workflow: + name: no-docker + tasks: + - name: task + image: alpine:3.18 + command: ["echo", "ok"] + ''') + executor = LocalExecutor( + work_dir=self.work_dir, + keep_work_dir=True, + docker_cmd='nonexistent-docker-binary-12345', + ) + spec = executor.load_spec(spec_text) + self.assertFalse(executor.execute(spec)) + + +class TestCookbookSpecValidation(unittest.TestCase): + """ + Validate that cookbook specs using unsupported features are rejected + before any container is started (no Docker required to run). + """ + + COOKBOOK_DIR = os.path.join(os.path.dirname(__file__), '..', '..', '..', + 'cookbook', 'tutorials') + + def setUp(self): + """Create a temporary work directory for cookbook validation tests.""" + self.work_dir = tempfile.mkdtemp(prefix='osmo-local-cookbook-') + + def tearDown(self): + """Remove the temporary work directory after each test.""" + shutil.rmtree(self.work_dir, ignore_errors=True) + + def _run_cookbook_spec(self, filename: str) -> bool: + """Execute a cookbook tutorial spec file through the local executor.""" + spec_path = os.path.join(self.COOKBOOK_DIR, filename) + self.assertTrue(os.path.exists(spec_path), + f'Cookbook file not found: {spec_path}') + return run_workflow_locally( + spec_path=spec_path, + work_dir=self.work_dir, + keep_work_dir=True, + ) + + def test_unsupported_spec_data_download(self): + """data_download.yaml uses URL inputs — verify it is cleanly rejected.""" + with self.assertRaises(ValueError) as context: + self._run_cookbook_spec('data_download.yaml') + self.assertIn('URL', str(context.exception)) + + def test_unsupported_spec_data_upload(self): + """data_upload.yaml uses URL outputs — verify it is cleanly rejected.""" + with self.assertRaises(ValueError) as context: + self._run_cookbook_spec('data_upload.yaml') + self.assertIn('object storage', str(context.exception).lower()) + + def test_unsupported_spec_dataset_upload(self): + """dataset_upload.yaml uses dataset outputs — verify it is cleanly rejected.""" + with 
self.assertRaises(ValueError) as context: + self._run_cookbook_spec('dataset_upload.yaml') + self.assertIn('dataset', str(context.exception).lower()) + + def test_unsupported_spec_template(self): + """template_hello_world.yaml uses default-values templating — verify it is rejected.""" + spec_path = os.path.join(self.COOKBOOK_DIR, 'template_hello_world.yaml') + self.assertTrue(os.path.exists(spec_path), + f'Cookbook file not found: {spec_path}') + with self.assertRaises(ValueError) as context: + run_workflow_locally( + spec_path=spec_path, + work_dir=self.work_dir, + keep_work_dir=True, + ) + self.assertIn('Jinja', str(context.exception)) + + +class TestRunWorkflowLocallyErrors(unittest.TestCase): + """Test error handling in run_workflow_locally() that does not require Docker.""" + + def test_nonexistent_file_raises(self): + """Passing a non-existent spec file path raises FileNotFoundError.""" + with self.assertRaises(FileNotFoundError): + run_workflow_locally(spec_path='/nonexistent/path/spec.yaml') + + # ============================================================================ # Integration tests — require Docker; test actual container execution # ============================================================================ @@ -1346,26 +1531,6 @@ def test_resources_ignored_gracefully(self): ''') self.assertTrue(self._execute_spec(spec_text)) - # ---- Docker-not-found handling ---- - - def test_docker_not_found_graceful_failure(self): - """Using a non-existent docker binary results in a graceful failure rather than a crash.""" - spec_text = textwrap.dedent('''\ - workflow: - name: no-docker - tasks: - - name: task - image: alpine:3.18 - command: ["echo", "ok"] - ''') - executor = LocalExecutor( - work_dir=self.work_dir, - keep_work_dir=True, - docker_cmd='nonexistent-docker-binary-12345', - ) - spec = executor.load_spec(spec_text) - self.assertFalse(executor.execute(spec)) - # ---- Alternative container runtime ---- def test_custom_docker_command(self): @@ 
-1411,8 +1576,8 @@ def tearDown(self): def _run_cookbook_spec(self, filename: str) -> bool: """Execute a cookbook tutorial spec file through the local executor.""" spec_path = os.path.join(self.COOKBOOK_DIR, filename) - if not os.path.exists(spec_path): - self.skipTest(f'Cookbook file not found: {spec_path}') + self.assertTrue(os.path.exists(spec_path), + f'Cookbook file not found: {spec_path}') return run_workflow_locally( spec_path=spec_path, work_dir=self.work_dir, @@ -1443,37 +1608,6 @@ def test_combination_workflow_simple_yaml(self): """ self.skipTest('Contains sleep 120; covered by test_groups_with_data_flow') - def test_unsupported_spec_data_download(self): - """data_download.yaml uses URL inputs — verify it is cleanly rejected.""" - with self.assertRaises(ValueError) as context: - self._run_cookbook_spec('data_download.yaml') - self.assertIn('URL', str(context.exception)) - - def test_unsupported_spec_data_upload(self): - """data_upload.yaml uses URL outputs — verify it is cleanly rejected.""" - with self.assertRaises(ValueError) as context: - self._run_cookbook_spec('data_upload.yaml') - self.assertIn('object storage', str(context.exception).lower()) - - def test_unsupported_spec_dataset_upload(self): - """dataset_upload.yaml uses dataset outputs — verify it is cleanly rejected.""" - with self.assertRaises(ValueError) as context: - self._run_cookbook_spec('dataset_upload.yaml') - self.assertIn('dataset', str(context.exception).lower()) - - def test_unsupported_spec_template(self): - """template_hello_world.yaml uses default-values templating — verify it is rejected.""" - spec_path = os.path.join(self.COOKBOOK_DIR, 'template_hello_world.yaml') - if not os.path.exists(spec_path): - self.skipTest('Cookbook file not found') - with self.assertRaises(ValueError) as context: - run_workflow_locally( - spec_path=spec_path, - work_dir=self.work_dir, - keep_work_dir=True, - ) - self.assertIn('Jinja', str(context.exception)) - # 
============================================================================ # run_workflow_locally() integration tests @@ -1562,11 +1696,6 @@ def test_keep_flag_preserves_on_success(self): finally: os.unlink(spec_path) - def test_nonexistent_file_raises(self): - """Passing a non-existent spec file path raises FileNotFoundError.""" - with self.assertRaises(FileNotFoundError): - run_workflow_locally(spec_path='/nonexistent/path/spec.yaml') - if __name__ == '__main__': unittest.main() From 684d554b007062f2a4182689d8f731180c2e5531 Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Fri, 3 Apr 2026 19:36:03 -0700 Subject: [PATCH 14/35] Implement file path validation in LocalExecutor to prevent directory traversal - Enhanced the `LocalExecutor` class to validate file paths, ensuring they do not escape the task directory. This prevents potential security risks associated with directory traversal attacks. - Added unit tests in `test_local_executor.py` to verify that invalid file paths raise appropriate exceptions, while valid paths are accepted without errors. - Updated documentation in `AGENTS.md` to include the new local executor functionality for Docker-based workflow execution. --- AGENTS.md | 2 + src/utils/local_executor.py | 7 ++- src/utils/tests/test_local_executor.py | 59 ++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 2 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 45b388253..3c8129e5a 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -120,6 +120,7 @@ Entry point: `service/core/service.py`. Framework: FastAPI + Uvicorn + OpenTelem | `utils/job/` | `Task`, `FrontendJob`, `K8sObjectFactory`, `PodGroupTopologyBuilder` | Workflow execution framework. Task → K8s spec generation. Gang scheduling via PodGroup. Topology constraints. Backend job definitions. | | `utils/connectors/` | `ClusterConnector`, `PostgresConnector`, `RedisConnector` | K8s API wrapper, PostgreSQL operations, Redis job queue management. 
| | `utils/secret_manager/` | `SecretManager` | JWE-based secret encryption/decryption. MEK/UEK key management. | +| `utils/local_executor.py` | `LocalExecutor`, `run_workflow_locally` | Local Docker-based workflow execution. Runs workflow specs without Kubernetes by mapping tasks to `docker run` commands with volume mounts for data flow. Supports DAG scheduling, resume (`--from-step`), and GPU passthrough. | | `utils/progress_check/` | — | Liveness/progress tracking for long-running services. | | `utils/metrics/` | — | Prometheus metrics collection and export. | @@ -139,6 +140,7 @@ Entry point: `cli.py` → `main_parser.py` (argparse). Subcommand modules: | `login.py` | Authentication | | `pool.py`, `resources.py`, `user.py`, `credential.py`, `access_token.py`, `bucket.py`, `task.py`, `version.py` | Supporting commands | | `backend.py` | Backend cluster management | +| `local.py` | Local workflow execution via Docker (`osmo local run`) | Features: Tab completion (shtab), response formatting (`formatters.py`), spec editor (`editor.py`), PyInstaller packaging (`cli_builder.py`, `packaging/`). 
diff --git a/src/utils/local_executor.py b/src/utils/local_executor.py index d5c8351d1..01c11b6de 100644 --- a/src/utils/local_executor.py +++ b/src/utils/local_executor.py @@ -358,7 +358,10 @@ def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskR for file_spec in task_spec.files: resolved_contents = self._substitute_tokens(file_spec.contents, token_map) - host_path = os.path.join(files_dir, file_spec.path.lstrip('/')) + host_path = os.path.realpath(os.path.join(files_dir, file_spec.path.lstrip('/'))) + if not host_path.startswith(os.path.realpath(files_dir) + os.sep): + raise ValueError( + f'Task "{node.name}": file path "{file_spec.path}" escapes the task directory') os.makedirs(os.path.dirname(host_path), exist_ok=True) with open(host_path, 'w', encoding='utf-8') as f: f.write(resolved_contents) @@ -400,7 +403,7 @@ def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskR docker_args += ['-v', f'{upstream_result.output_dir}:{input_mount}:ro'] for file_spec in task_spec.files: - host_path = os.path.join(files_dir, file_spec.path.lstrip('/')) + host_path = os.path.realpath(os.path.join(files_dir, file_spec.path.lstrip('/'))) docker_args += ['-v', f'{host_path}:{file_spec.path}:ro'] if resolved_command: diff --git a/src/utils/tests/test_local_executor.py b/src/utils/tests/test_local_executor.py index 25ccac7cb..7d3f19dd4 100644 --- a/src/utils/tests/test_local_executor.py +++ b/src/utils/tests/test_local_executor.py @@ -828,6 +828,65 @@ def test_unsupported_fields_rejected(self): self.assertIn(case['expected_substring'], str(context.exception)) +class TestFilePathTraversal(unittest.TestCase): + """Verify that file paths cannot escape the task directory.""" + + def setUp(self): + """Create a temporary work directory.""" + self.work_dir = tempfile.mkdtemp(prefix='osmo-local-traversal-') + + def tearDown(self): + """Remove the temporary work directory.""" + shutil.rmtree(self.work_dir, ignore_errors=True) + + 
@mock.patch('subprocess.run') + def test_path_traversal_rejected(self, mock_run): + """A file spec with a path that escapes the task directory raises ValueError.""" + mock_run.return_value = mock.Mock(returncode=0) + spec_text = textwrap.dedent('''\ + workflow: + name: traversal + tasks: + - name: task + image: alpine:3.18 + command: ["echo"] + files: + - contents: "malicious" + path: /../../etc/evil.conf + ''') + executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + executor._setup_directories() + node = executor._task_nodes['task'] + with self.assertRaises(ValueError) as context: + executor._run_task(node, spec) + self.assertIn('escapes the task directory', str(context.exception)) + + @mock.patch('subprocess.run') + def test_safe_nested_path_accepted(self, mock_run): + """A file spec with a safe nested path is accepted without error.""" + mock_run.return_value = mock.Mock(returncode=0) + spec_text = textwrap.dedent('''\ + workflow: + name: safe + tasks: + - name: task + image: alpine:3.18 + command: ["echo"] + files: + - contents: "safe" + path: /tmp/scripts/run.sh + ''') + executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + executor._setup_directories() + node = executor._task_nodes['task'] + executor._run_task(node, spec) + mock_run.assert_called_once() + + class TestShmSize(unittest.TestCase): """Verify that --shm-size is passed to Docker for GPU tasks.""" From 88bc29535a133d752f27bae3abd5c5529ee00589 Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Fri, 3 Apr 2026 20:15:00 -0700 Subject: [PATCH 15/35] Clear GPU device specification in Docker arguments for LocalExecutor - Updated the `LocalExecutor` class to remove unnecessary quotes around GPU device specifications in Docker command arguments, ensuring correct formatting. 
- Cleared previous results at the start of the `execute` method to prevent data carryover between executions. --- src/utils/local_executor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/utils/local_executor.py b/src/utils/local_executor.py index 01c11b6de..729f2f5af 100644 --- a/src/utils/local_executor.py +++ b/src/utils/local_executor.py @@ -120,6 +120,7 @@ def load_spec(self, spec_text: str) -> workflow_module.WorkflowSpec: def execute(self, spec: workflow_module.WorkflowSpec, resume: bool = False, from_step: str | None = None) -> bool: """Run all tasks in topological order, returning True if the entire workflow succeeds.""" + self._results.clear() self._build_dag(spec) self._validate_for_local(spec) self._setup_directories() @@ -382,9 +383,9 @@ def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskR logger.warning( 'Task "%s" requests %d GPU(s) but only %d available — running with %d GPU(s)', node.name, gpu_count, available, available) - docker_args += ['--gpus', f'"device={",".join(str(i) for i in range(available))}"'] + docker_args += ['--gpus', f'device={",".join(str(i) for i in range(available))}'] else: - docker_args += ['--gpus', f'"device={",".join(str(i) for i in range(gpu_count))}"'] + docker_args += ['--gpus', f'device={",".join(str(i) for i in range(gpu_count))}'] logger.info('Task "%s" requesting %d GPU(s), using %d', node.name, gpu_count, min(gpu_count, available)) shm_size = self._shm_size or self.DEFAULT_SHM_SIZE From 93fe43c2c9f27cc9176dfb0e5366509ff07345cb Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Fri, 3 Apr 2026 20:34:53 -0700 Subject: [PATCH 16/35] Refactor shared memory size handling in LocalExecutor - Updated the `LocalExecutor` class to ensure the `--shm-size` argument is included for CPU-only tasks when explicitly specified by the user. - Adjusted the logic for setting the shared memory size to improve clarity and maintainability. 
- Enhanced unit tests in `test_local_executor.py` to verify correct behavior for tasks with and without GPU resources regarding shared memory size. --- src/utils/local_executor.py | 15 +++++++------- src/utils/tests/test_local_executor.py | 28 ++++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/src/utils/local_executor.py b/src/utils/local_executor.py index 729f2f5af..da6d61f94 100644 --- a/src/utils/local_executor.py +++ b/src/utils/local_executor.py @@ -388,8 +388,9 @@ def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskR docker_args += ['--gpus', f'device={",".join(str(i) for i in range(gpu_count))}'] logger.info('Task "%s" requesting %d GPU(s), using %d', node.name, gpu_count, min(gpu_count, available)) - shm_size = self._shm_size or self.DEFAULT_SHM_SIZE - docker_args += ['--shm-size', shm_size] + docker_args += ['--shm-size', self._shm_size or self.DEFAULT_SHM_SIZE] + elif self._shm_size: + docker_args += ['--shm-size', self._shm_size] for key, value in task_spec.environment.items(): resolved_value = self._substitute_tokens(value, token_map) @@ -463,11 +464,6 @@ def run_workflow_locally(spec_path: str, work_dir: str | None = None, raise ValueError( '--resume and --from-step require --work-dir pointing to a previous run directory.') - created_work_dir = work_dir is None - if created_work_dir: - work_dir = tempfile.mkdtemp(prefix='osmo-local-') - logger.info('Using temporary work directory: %s', work_dir) - with open(spec_path, encoding='utf-8') as f: spec_text = f.read() @@ -478,6 +474,11 @@ def run_workflow_locally(spec_path: str, work_dir: str | None = None, 'Run "osmo workflow submit --dry-run -f " first to get the expanded spec,\n' 'then save that output and run it locally.') + created_work_dir = work_dir is None + if created_work_dir: + work_dir = tempfile.mkdtemp(prefix='osmo-local-') + logger.info('Using temporary work directory: %s', work_dir) + executor = LocalExecutor(work_dir=work_dir, 
keep_work_dir=keep_work_dir, docker_cmd=docker_cmd, shm_size=shm_size) spec = executor.load_spec(spec_text) diff --git a/src/utils/tests/test_local_executor.py b/src/utils/tests/test_local_executor.py index 7d3f19dd4..97226248d 100644 --- a/src/utils/tests/test_local_executor.py +++ b/src/utils/tests/test_local_executor.py @@ -955,8 +955,8 @@ def test_gpu_task_gets_custom_shm_size(self, mock_run): self.assertEqual(docker_call_args[shm_index + 1], '32g') @mock.patch('subprocess.run') - def test_non_gpu_task_has_no_shm_size(self, mock_run): - """A task without GPU resources does not include --shm-size in Docker args.""" + def test_non_gpu_task_has_no_default_shm_size(self, mock_run): + """A CPU-only task without explicit shm_size does not include --shm-size.""" mock_run.return_value = mock.Mock(returncode=0) spec_text = textwrap.dedent('''\ workflow: @@ -976,6 +976,30 @@ def test_non_gpu_task_has_no_shm_size(self, mock_run): docker_call_args = mock_run.call_args[0][0] self.assertNotIn('--shm-size', docker_call_args) + @mock.patch('subprocess.run') + def test_non_gpu_task_gets_explicit_shm_size(self, mock_run): + """A CPU-only task gets --shm-size when the user explicitly specifies it.""" + mock_run.return_value = mock.Mock(returncode=0) + spec_text = textwrap.dedent('''\ + workflow: + name: no-gpu + tasks: + - name: preprocess + image: alpine:3.18 + command: ["echo", "ok"] + ''') + executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True, shm_size='8g') + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + executor._setup_directories() + node = executor._task_nodes['preprocess'] + executor._run_task(node, spec) + + docker_call_args = mock_run.call_args[0][0] + self.assertIn('--shm-size', docker_call_args) + shm_index = docker_call_args.index('--shm-size') + self.assertEqual(docker_call_args[shm_index + 1], '8g') + class TestJinjaTemplateDetection(unittest.TestCase): """Verify that specs containing Jinja template markers are rejected before 
execution.""" From ed0d746b9b1402b23ad39cfc7caa2ea4b236762b Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Thu, 9 Apr 2026 11:36:42 -0700 Subject: [PATCH 17/35] Refactor LocalExecutor to use container paths for token mapping - Updated the `LocalExecutor` class to replace host paths with container-side paths in the token mapping for output and input sources. - Adjusted the `_build_token_map` method to eliminate the output directory parameter, aligning with the new container path structure. - Modified Docker argument construction to reflect the new paths, ensuring correct volume mounts for inputs and outputs. - Enhanced unit tests in `test_local_executor.py` to validate the updated token mapping behavior and ensure proper handling of container paths. --- src/utils/local_executor.py | 22 +++++++++++----------- src/utils/tests/test_local_executor.py | 16 ++++++++-------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/utils/local_executor.py b/src/utils/local_executor.py index da6d61f94..a66a5bb9c 100644 --- a/src/utils/local_executor.py +++ b/src/utils/local_executor.py @@ -35,6 +35,7 @@ logger = logging.getLogger(__name__) STATE_FILE_NAME = '.osmo-state.json' +CONTAINER_DATA_PATH = '/osmo/data' @dataclasses.dataclass @@ -355,7 +356,7 @@ def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskR files_dir = os.path.join(task_dir, 'files') os.makedirs(files_dir, exist_ok=True) - token_map = self._build_token_map(node, output_dir) + token_map = self._build_token_map(node) for file_spec in task_spec.files: resolved_contents = self._substitute_tokens(file_spec.contents, token_map) @@ -396,13 +397,12 @@ def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskR resolved_value = self._substitute_tokens(value, token_map) docker_args += ['-e', f'{key}={resolved_value}'] - docker_args += ['-v', f'{output_dir}:{output_dir}'] + docker_args += ['-v', f'{output_dir}:{CONTAINER_DATA_PATH}/output'] for index, 
input_source in enumerate(task_spec.inputs): if isinstance(input_source, task_module.TaskInputOutput): upstream_result = self._results[input_source.task] - input_mount = token_map.get(f'input:{index}', upstream_result.output_dir) - docker_args += ['-v', f'{upstream_result.output_dir}:{input_mount}:ro'] + docker_args += ['-v', f'{upstream_result.output_dir}:{CONTAINER_DATA_PATH}/input/{index}:ro'] for file_spec in task_spec.files: host_path = os.path.realpath(os.path.join(files_dir, file_spec.path.lstrip('/'))) @@ -434,16 +434,16 @@ def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskR logger.error('Docker not found. Is Docker installed and in your PATH?') return TaskResult(name=node.name, exit_code=127, output_dir=output_dir) - def _build_token_map(self, node: TaskNode, output_dir: str) -> Dict[str, str]: - """Build a mapping of {{token}} keys to host paths for output and each upstream input.""" + def _build_token_map(self, node: TaskNode) -> Dict[str, str]: + """Build a mapping of {{token}} keys to container-side paths matching on-cluster layout.""" tokens: Dict[str, str] = { - 'output': output_dir, + 'output': f'{CONTAINER_DATA_PATH}/output', } for index, input_source in enumerate(node.spec.inputs): if isinstance(input_source, task_module.TaskInputOutput): - upstream_result = self._results[input_source.task] - tokens[f'input:{input_source.task}'] = upstream_result.output_dir - tokens[f'input:{index}'] = upstream_result.output_dir + container_input_path = f'{CONTAINER_DATA_PATH}/input/{index}' + tokens[f'input:{input_source.task}'] = container_input_path + tokens[f'input:{index}'] = container_input_path return tokens def _substitute_tokens(self, text: str, tokens: Dict[str, str]) -> str: @@ -475,7 +475,7 @@ def run_workflow_locally(spec_path: str, work_dir: str | None = None, 'then save that output and run it locally.') created_work_dir = work_dir is None - if created_work_dir: + if work_dir is None: work_dir = 
tempfile.mkdtemp(prefix='osmo-local-') logger.info('Using temporary work directory: %s', work_dir) diff --git a/src/utils/tests/test_local_executor.py b/src/utils/tests/test_local_executor.py index 97226248d..a4cb6b587 100644 --- a/src/utils/tests/test_local_executor.py +++ b/src/utils/tests/test_local_executor.py @@ -25,7 +25,7 @@ from unittest import mock from src.utils.job import task as task_module -from src.utils.local_executor import LocalExecutor, TaskNode, TaskResult, run_workflow_locally +from src.utils.local_executor import CONTAINER_DATA_PATH, LocalExecutor, TaskNode, TaskResult, run_workflow_locally # --------------------------------------------------------------------------- @@ -543,12 +543,12 @@ def test_output_only(self): executor._build_dag(spec) node = executor._task_nodes['task1'] - tokens = executor._build_token_map(node, '/tmp/work/task1/output') - self.assertEqual(tokens['output'], '/tmp/work/task1/output') + tokens = executor._build_token_map(node) + self.assertEqual(tokens['output'], f'{CONTAINER_DATA_PATH}/output') self.assertEqual(len(tokens), 1) def test_with_upstream_inputs(self): - """A task with upstream inputs gets both index-based and name-based input tokens.""" + """A task with upstream inputs gets both index-based and name-based input tokens pointing to container paths.""" spec_text = textwrap.dedent('''\ workflow: name: serial @@ -570,11 +570,11 @@ def test_with_upstream_inputs(self): name='producer', exit_code=0, output_dir='/tmp/work/producer/output') node = executor._task_nodes['consumer'] - tokens = executor._build_token_map(node, '/tmp/work/consumer/output') + tokens = executor._build_token_map(node) - self.assertEqual(tokens['output'], '/tmp/work/consumer/output') - self.assertEqual(tokens['input:0'], '/tmp/work/producer/output') - self.assertEqual(tokens['input:producer'], '/tmp/work/producer/output') + self.assertEqual(tokens['output'], f'{CONTAINER_DATA_PATH}/output') + self.assertEqual(tokens['input:0'], 
f'{CONTAINER_DATA_PATH}/input/0') + self.assertEqual(tokens['input:producer'], f'{CONTAINER_DATA_PATH}/input/0') class TestValidateForLocal(unittest.TestCase): From b1bf7924b0261f630edb1a715b484439bb57aa38 Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Thu, 9 Apr 2026 11:46:38 -0700 Subject: [PATCH 18/35] Add cycle detection in LocalExecutor for task DAGs - Implemented a `_check_for_cycles` method in the `LocalExecutor` class to identify and raise a `ValueError` for circular dependencies in task DAGs. - Added a new test class `TestCycleDetection` in `test_local_executor.py` to validate the detection of direct and indirect cycles, ensuring proper error reporting for various cyclic configurations. - Enhanced existing tests to confirm that non-cyclic task structures are accepted without errors. --- src/utils/local_executor.py | 31 ++++++ src/utils/tests/test_local_executor.py | 146 +++++++++++++++++++++++++ 2 files changed, 177 insertions(+) diff --git a/src/utils/local_executor.py b/src/utils/local_executor.py index a66a5bb9c..563935803 100644 --- a/src/utils/local_executor.py +++ b/src/utils/local_executor.py @@ -265,6 +265,37 @@ def _build_dag(self, spec: workflow_module.WorkflowSpec): self._task_nodes[task_spec.name].upstream.add(upstream_task) self._task_nodes[upstream_task].downstream.add(task_spec.name) + self._check_for_cycles() + + def _check_for_cycles(self): + """Raise ValueError if the task DAG contains any cycles, reporting the cycle path.""" + UNVISITED, IN_PROGRESS, DONE = 0, 1, 2 + state: Dict[str, int] = {name: UNVISITED for name in self._task_nodes} + path: List[str] = [] + + def visit(name: str) -> List[str] | None: + if state[name] == DONE: + return None + if state[name] == IN_PROGRESS: + cycle_start = path.index(name) + return path[cycle_start:] + [name] + + state[name] = IN_PROGRESS + path.append(name) + for downstream in self._task_nodes[name].downstream: + cycle = visit(downstream) + if cycle is not None: + return cycle + path.pop() + 
state[name] = DONE + return None + + for name in self._task_nodes: + cycle = visit(name) + if cycle is not None: + raise ValueError( + f'Circular dependency detected: {" -> ".join(cycle)}') + def _validate_for_local(self, spec: workflow_module.WorkflowSpec): """Raise ValueError if the spec uses features unsupported in local mode (datasets, URLs, credentials, etc.).""" unsupported_features = [] diff --git a/src/utils/tests/test_local_executor.py b/src/utils/tests/test_local_executor.py index a4cb6b587..81561f56c 100644 --- a/src/utils/tests/test_local_executor.py +++ b/src/utils/tests/test_local_executor.py @@ -371,6 +371,152 @@ def test_groups_with_cross_group_deps(self): self.assertEqual(executor._task_nodes['transform'].upstream, {'download'}) +class TestCycleDetection(unittest.TestCase): + """Verify that circular dependencies are detected and reported during DAG construction.""" + + def _make_executor(self) -> LocalExecutor: + """Create a LocalExecutor with a throwaway work directory for cycle-detection tests.""" + return LocalExecutor(work_dir='/tmp/unused') + + def test_direct_cycle_two_tasks(self): + """Two tasks that depend on each other form a direct cycle and are rejected.""" + spec_text = textwrap.dedent('''\ + workflow: + name: cycle + tasks: + - name: a + image: alpine:3.18 + command: ["echo"] + inputs: + - task: b + - name: b + image: alpine:3.18 + command: ["echo"] + inputs: + - task: a + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + with self.assertRaises(ValueError) as context: + executor._build_dag(spec) + error_message = str(context.exception) + self.assertIn('Circular dependency', error_message) + self.assertIn('a', error_message) + self.assertIn('b', error_message) + + def test_indirect_cycle_three_tasks(self): + """Three tasks forming a cycle (a -> b -> c -> a) are rejected.""" + spec_text = textwrap.dedent('''\ + workflow: + name: cycle + tasks: + - name: a + image: alpine:3.18 + command: ["echo"] + inputs: + 
- task: c + - name: b + image: alpine:3.18 + command: ["echo"] + inputs: + - task: a + - name: c + image: alpine:3.18 + command: ["echo"] + inputs: + - task: b + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + with self.assertRaises(ValueError) as context: + executor._build_dag(spec) + self.assertIn('Circular dependency', str(context.exception)) + + def test_cycle_in_subgraph_with_valid_root(self): + """A cycle in a subgraph is detected even when other tasks have no cycle.""" + spec_text = textwrap.dedent('''\ + workflow: + name: partial-cycle + tasks: + - name: root + image: alpine:3.18 + command: ["echo"] + - name: a + image: alpine:3.18 + command: ["echo"] + inputs: + - task: root + - task: b + - name: b + image: alpine:3.18 + command: ["echo"] + inputs: + - task: a + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + with self.assertRaises(ValueError) as context: + executor._build_dag(spec) + error_message = str(context.exception) + self.assertIn('Circular dependency', error_message) + self.assertIn('a', error_message) + self.assertIn('b', error_message) + + def test_no_cycle_linear_chain(self): + """A linear chain (a -> b -> c) has no cycle and is accepted.""" + spec_text = textwrap.dedent('''\ + workflow: + name: linear + tasks: + - name: a + image: alpine:3.18 + command: ["echo"] + - name: b + image: alpine:3.18 + command: ["echo"] + inputs: + - task: a + - name: c + image: alpine:3.18 + command: ["echo"] + inputs: + - task: b + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + + def test_no_cycle_diamond(self): + """A diamond DAG (root -> left/right -> join) has no cycle and is accepted.""" + spec_text = textwrap.dedent('''\ + workflow: + name: diamond + tasks: + - name: root + image: alpine:3.18 + command: ["echo"] + - name: left + image: alpine:3.18 + command: ["echo"] + inputs: + - task: root + - name: right + image: alpine:3.18 + 
command: ["echo"] + inputs: + - task: root + - name: join + image: alpine:3.18 + command: ["echo"] + inputs: + - task: left + - task: right + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + + class TestFindReadyTasks(unittest.TestCase): """Verify correct identification of tasks ready to execute.""" From 04615ca8ad0e143a9a0bb00eb8822b969e40f471 Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Thu, 9 Apr 2026 11:57:51 -0700 Subject: [PATCH 19/35] Enhance LocalExecutor with lead-task failure policy and host token support - Added support for a lead-task failure policy in the `LocalExecutor`, allowing non-lead task failures to be ignored when `ignoreNonleadStatus` is set to true. - Implemented checks to determine if a task's failure can be ignored based on its lead status and group configuration. - Introduced validation for tasks using `{{host:taskname}}` tokens, ensuring proper error handling for unsupported configurations. - Expanded unit tests in `test_local_executor.py` to cover various scenarios for lead-task behavior and host token usage, ensuring robust functionality. --- src/utils/local_executor.py | 65 +++++++-- src/utils/tests/test_local_executor.py | 194 +++++++++++++++++++++++++ 2 files changed, 246 insertions(+), 13 deletions(-) diff --git a/src/utils/local_executor.py b/src/utils/local_executor.py index 563935803..832f8e4a1 100644 --- a/src/utils/local_executor.py +++ b/src/utils/local_executor.py @@ -63,7 +63,8 @@ class LocalExecutor: Executes an OSMO workflow spec locally using Docker, without Kubernetes. 
Supports: - - Serial and parallel task DAGs (groups flattened to individual tasks) + - Serial and parallel task DAGs + - Task groups with lead-task failure policy (ignoreNonleadStatus) - {{output}} and {{input:N}} / {{input:taskname}} token substitution - Inline `files:` written to the container - `environment:` passed as Docker env vars @@ -74,6 +75,7 @@ class LocalExecutor: - Dataset / URL inputs/outputs (require object storage) - Credentials, checkpoints, volumeMounts (require cluster infra) - Templated specs with Jinja (require server-side expansion; use --dry-run first) + - {{host:taskname}} tokens (require parallel containers with shared networking) """ DEFAULT_SHM_SIZE = '16g' @@ -86,6 +88,7 @@ def __init__(self, work_dir: str, keep_work_dir: bool = False, docker_cmd: str = self._docker_cmd = docker_cmd self._shm_size = shm_size self._task_nodes: Dict[str, TaskNode] = {} + self._group_specs: Dict[str, task_module.TaskGroupSpec] = {} self._results: Dict[str, TaskResult] = {} self._available_gpus: int | None = None @@ -149,11 +152,17 @@ def execute(self, spec: workflow_module.WorkflowSpec, self._save_state() if result.exit_code != 0: - logger.error('Task "%s" failed with exit code %d', task_name, result.exit_code) - self._cancel_downstream(task_name) - return False - - logger.info('Task "%s" completed successfully', task_name) + if self._is_nonlead_failure_ignorable(task_name): + logger.warning( + 'Non-lead task "%s" failed with exit code %d ' + '(ignored — group "%s" has ignoreNonleadStatus=true)', + task_name, result.exit_code, node.group) + else: + logger.error('Task "%s" failed with exit code %d', task_name, result.exit_code) + self._cancel_downstream(task_name) + return False + else: + logger.info('Task "%s" completed successfully', task_name) ready = self._find_ready_tasks() @@ -163,9 +172,12 @@ def execute(self, spec: workflow_module.WorkflowSpec, spec.name, ', '.join(sorted(unexecuted))) return False - failed = [name for name, r in 
self._results.items() if r.exit_code != 0] - if failed: - logger.error('Workflow failed. Failed tasks: %s', ', '.join(failed)) + fatal_failures = [ + name for name, r in self._results.items() + if r.exit_code != 0 and not self._is_nonlead_failure_ignorable(name) + ] + if fatal_failures: + logger.error('Workflow failed. Failed tasks: %s', ', '.join(fatal_failures)) return False logger.info('Workflow "%s" completed successfully', spec.name) @@ -243,11 +255,11 @@ def _groups(self, spec: workflow_module.WorkflowSpec) -> List[task_module.TaskGr def _build_dag(self, spec: workflow_module.WorkflowSpec): """Construct the internal DAG of TaskNodes from the workflow spec's tasks and input dependencies.""" self._task_nodes.clear() - task_to_group: Dict[str, str] = {} + self._group_specs.clear() for group in self._groups(spec): + self._group_specs[group.name] = group for task_spec in group.tasks: - task_to_group[task_spec.name] = group.name self._task_nodes[task_spec.name] = TaskNode( name=task_spec.name, spec=task_spec, @@ -296,6 +308,8 @@ def visit(name: str) -> List[str] | None: raise ValueError( f'Circular dependency detected: {" -> ".join(cycle)}') + _HOST_TOKEN_PATTERN = re.compile(r'\{\{\s*host:[^}]+\}\}') + def _validate_for_local(self, spec: workflow_module.WorkflowSpec): """Raise ValueError if the spec uses features unsupported in local mode (datasets, URLs, credentials, etc.).""" unsupported_features = [] @@ -334,26 +348,51 @@ def _validate_for_local(self, spec: workflow_module.WorkflowSpec): unsupported_features.append( f'Task "{task_spec.name}": hostNetwork is not supported in local mode') + if self._task_uses_host_tokens(task_spec): + unsupported_features.append( + f'Task "{task_spec.name}": {{{{host:taskname}}}} tokens require ' + f'parallel containers with shared networking') + if unsupported_features: raise ValueError( 'The following features are not supported in local execution mode:\n - ' + '\n - '.join(unsupported_features)) + def 
_task_uses_host_tokens(self, task_spec: task_module.TaskSpec) -> bool: + """Return True if any text field in the task spec contains {{host:...}} tokens.""" + fields_to_check = list(task_spec.command) + list(task_spec.args) + fields_to_check += list(task_spec.environment.values()) + fields_to_check += [file_spec.contents for file_spec in task_spec.files] + return any(self._HOST_TOKEN_PATTERN.search(field) for field in fields_to_check) + def _setup_directories(self): """Create the work directory and per-task output directories on the host filesystem.""" os.makedirs(self._work_dir, exist_ok=True) for task_name in self._task_nodes: os.makedirs(os.path.join(self._work_dir, task_name, 'output'), exist_ok=True) + def _is_nonlead_failure_ignorable(self, task_name: str) -> bool: + """Return True if the task is a non-lead task in a group with ignoreNonleadStatus=true.""" + node = self._task_nodes[task_name] + group_spec = self._group_specs[node.group] + return group_spec.ignoreNonleadStatus and not node.spec.lead + + def _is_task_satisfied(self, task_name: str) -> bool: + """Return True if a completed task's result counts as satisfied for downstream scheduling.""" + result = self._results[task_name] + if result.exit_code == 0: + return True + return self._is_nonlead_failure_ignorable(task_name) + def _find_ready_tasks(self) -> List[str]: - """Return tasks whose upstream dependencies have all completed successfully.""" + """Return tasks whose upstream dependencies have all been satisfied, in spec declaration order.""" completed = set(self._results.keys()) ready = [] for name, node in self._task_nodes.items(): if name in completed: continue if node.upstream.issubset(completed): - all_upstream_ok = all(self._results[u].exit_code == 0 for u in node.upstream) + all_upstream_ok = all(self._is_task_satisfied(u) for u in node.upstream) if all_upstream_ok: ready.append(name) return ready diff --git a/src/utils/tests/test_local_executor.py b/src/utils/tests/test_local_executor.py 
index 81561f56c..5fb38bb06 100644 --- a/src/utils/tests/test_local_executor.py +++ b/src/utils/tests/test_local_executor.py @@ -960,6 +960,64 @@ class TestValidateForLocalRemainingBranches(unittest.TestCase): '''), 'expected_substring': 'hostNetwork', }, + 'host_token_in_args': { + 'yaml': textwrap.dedent('''\ + workflow: + name: bad + groups: + - name: workers + tasks: + - name: leader + lead: true + image: ubuntu:24.04 + command: ["echo"] + args: ["--peer={{host:follower}}"] + - name: follower + image: ubuntu:24.04 + command: ["echo"] + '''), + 'expected_substring': 'host:taskname', + }, + 'host_token_in_env': { + 'yaml': textwrap.dedent('''\ + workflow: + name: bad + groups: + - name: workers + tasks: + - name: leader + lead: true + image: ubuntu:24.04 + command: ["echo"] + environment: + PEER_HOST: "{{ host:follower }}" + - name: follower + image: ubuntu:24.04 + command: ["echo"] + '''), + 'expected_substring': 'host:taskname', + }, + 'host_token_in_files': { + 'yaml': textwrap.dedent('''\ + workflow: + name: bad + groups: + - name: workers + tasks: + - name: leader + lead: true + image: ubuntu:24.04 + command: ["sh", "/tmp/run.sh"] + files: + - contents: | + echo "connecting to {{host:follower}}" + path: /tmp/run.sh + - name: follower + image: ubuntu:24.04 + command: ["echo"] + '''), + 'expected_substring': 'host:taskname', + }, } def test_unsupported_fields_rejected(self): @@ -1033,6 +1091,142 @@ def test_safe_nested_path_accepted(self, mock_run): mock_run.assert_called_once() +class TestLeadTaskFailurePolicy(unittest.TestCase): + """Verify ignoreNonleadStatus behavior: non-lead failures are tolerated when the flag is true.""" + + def setUp(self): + """Create a temporary work directory for lead-task policy tests.""" + self.work_dir = tempfile.mkdtemp(prefix='osmo-local-lead-') + + def tearDown(self): + """Remove the temporary work directory after each test.""" + shutil.rmtree(self.work_dir, ignore_errors=True) + + @mock.patch('subprocess.run') + def 
test_nonlead_failure_ignored_when_flag_true(self, mock_run): + """With ignoreNonleadStatus=true (default), a non-lead failure does not abort the workflow.""" + mock_run.side_effect = [ + mock.Mock(returncode=0), + mock.Mock(returncode=1), + ] + spec_text = textwrap.dedent('''\ + workflow: + name: lead-policy + groups: + - name: workers + tasks: + - name: leader + lead: true + image: alpine:3.18 + command: ["echo", "ok"] + - name: follower + image: alpine:3.18 + command: ["sh", "-c", "exit 1"] + ''') + executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + self.assertTrue(executor.execute(spec)) + + @mock.patch('subprocess.run') + def test_lead_failure_aborts_workflow(self, mock_run): + """Even with ignoreNonleadStatus=true, a lead task failure aborts the workflow.""" + mock_run.return_value = mock.Mock(returncode=1) + spec_text = textwrap.dedent('''\ + workflow: + name: lead-policy + groups: + - name: workers + tasks: + - name: leader + lead: true + image: alpine:3.18 + command: ["sh", "-c", "exit 1"] + - name: follower + image: alpine:3.18 + command: ["echo", "ok"] + ''') + executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + self.assertFalse(executor.execute(spec)) + + @mock.patch('subprocess.run') + def test_nonlead_failure_aborts_when_flag_false(self, mock_run): + """With ignoreNonleadStatus=false, a non-lead failure aborts the workflow.""" + mock_run.side_effect = [ + mock.Mock(returncode=0), + mock.Mock(returncode=1), + ] + spec_text = textwrap.dedent('''\ + workflow: + name: lead-policy + groups: + - name: workers + ignoreNonleadStatus: false + tasks: + - name: leader + lead: true + image: alpine:3.18 + command: ["echo", "ok"] + - name: follower + image: alpine:3.18 + command: ["sh", "-c", "exit 1"] + ''') + executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + 
self.assertFalse(executor.execute(spec)) + + @mock.patch('subprocess.run') + def test_nonlead_failure_does_not_block_downstream_group(self, mock_run): + """A tolerated non-lead failure does not prevent a downstream group from running.""" + mock_run.side_effect = [ + mock.Mock(returncode=0), + mock.Mock(returncode=1), + mock.Mock(returncode=0), + ] + spec_text = textwrap.dedent('''\ + workflow: + name: downstream-after-nonlead-fail + groups: + - name: first + tasks: + - name: leader + lead: true + image: alpine:3.18 + command: ["echo", "ok"] + - name: follower + image: alpine:3.18 + command: ["sh", "-c", "exit 1"] + - name: second + tasks: + - name: consumer + lead: true + image: alpine:3.18 + command: ["echo", "ok"] + inputs: + - task: leader + ''') + executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + self.assertTrue(executor.execute(spec)) + self.assertEqual(mock_run.call_count, 3) + + @mock.patch('subprocess.run') + def test_single_task_group_failure_aborts(self, mock_run): + """A single-task group (auto-promoted to lead) aborts on failure like normal.""" + mock_run.return_value = mock.Mock(returncode=1) + spec_text = textwrap.dedent('''\ + workflow: + name: single-fail + tasks: + - name: only-task + image: alpine:3.18 + command: ["sh", "-c", "exit 1"] + ''') + executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + self.assertFalse(executor.execute(spec)) + + class TestShmSize(unittest.TestCase): """Verify that --shm-size is passed to Docker for GPU tasks.""" From 3c88f8ff35069c1c7674a642c82ed151cd1dc2ce Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Thu, 9 Apr 2026 12:37:18 -0700 Subject: [PATCH 20/35] Enhance LocalExecutor with unresolved token detection - Added functionality to detect unresolved {{variable}} tokens in command, args, environment values, and file contents before executing tasks in the `LocalExecutor`. 
- Implemented a `_check_unresolved_tokens` method to raise a ValueError if unresolved tokens are found, providing a helpful error message suggesting the use of a dry-run for template expansion. - Introduced a new test class `TestUnresolvedTokenDetection` in `test_local_executor.py` to validate the detection of unresolved tokens across various scenarios, ensuring robust error handling. --- src/utils/local_executor.py | 26 ++++- src/utils/tests/test_local_executor.py | 140 +++++++++++++++++++++++++ 2 files changed, 163 insertions(+), 3 deletions(-) diff --git a/src/utils/local_executor.py b/src/utils/local_executor.py index 832f8e4a1..c516a46fb 100644 --- a/src/utils/local_executor.py +++ b/src/utils/local_executor.py @@ -440,6 +440,11 @@ def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskR resolved_command = [self._substitute_tokens(c, token_map) for c in task_spec.command] resolved_args = [self._substitute_tokens(a, token_map) for a in task_spec.args] + resolved_env_values = [self._substitute_tokens(v, token_map) for v in task_spec.environment.values()] + + all_resolved = resolved_command + resolved_args + resolved_env_values + all_resolved += [self._substitute_tokens(f.contents, token_map) for f in task_spec.files] + self._check_unresolved_tokens(node.name, all_resolved) docker_args = [self._docker_cmd, 'run', '--rm'] @@ -463,9 +468,8 @@ def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskR elif self._shm_size: docker_args += ['--shm-size', self._shm_size] - for key, value in task_spec.environment.items(): - resolved_value = self._substitute_tokens(value, token_map) - docker_args += ['-e', f'{key}={resolved_value}'] + for env_key, resolved_value in zip(task_spec.environment.keys(), resolved_env_values): + docker_args += ['-e', f'{env_key}={resolved_value}'] docker_args += ['-v', f'{output_dir}:{CONTAINER_DATA_PATH}/output'] @@ -516,12 +520,28 @@ def _build_token_map(self, node: TaskNode) -> Dict[str, str]: 
tokens[f'input:{index}'] = container_input_path return tokens + _UNRESOLVED_TOKEN_PATTERN = re.compile(r'\{\{[^}]+\}\}') + def _substitute_tokens(self, text: str, tokens: Dict[str, str]) -> str: """Replace all {{key}} placeholders in text with their corresponding token values.""" for key, value in tokens.items(): text = re.sub(r'\{\{\s*' + re.escape(key) + r'\s*\}\}', value, text) return text + def _check_unresolved_tokens(self, task_name: str, resolved_fields: List[str]): + """Raise ValueError if any resolved field still contains {{ }} placeholders.""" + unresolved: List[str] = [] + for field in resolved_fields: + for match in self._UNRESOLVED_TOKEN_PATTERN.finditer(field): + token = match.group(0) + if token not in unresolved: + unresolved.append(token) + if unresolved: + raise ValueError( + f'Task "{task_name}" has unresolved token(s): {", ".join(unresolved)}. ' + f'If this spec uses Jinja templates, run "osmo workflow submit --dry-run -f " ' + f'first to expand them.') + def run_workflow_locally(spec_path: str, work_dir: str | None = None, keep_work_dir: bool = False, diff --git a/src/utils/tests/test_local_executor.py b/src/utils/tests/test_local_executor.py index 5fb38bb06..3c234c0dc 100644 --- a/src/utils/tests/test_local_executor.py +++ b/src/utils/tests/test_local_executor.py @@ -1227,6 +1227,146 @@ def test_single_task_group_failure_aborts(self, mock_run): self.assertFalse(executor.execute(spec)) +class TestUnresolvedTokenDetection(unittest.TestCase): + """Verify that unresolved {{variable}} tokens are detected before running containers.""" + + def setUp(self): + """Create a temporary work directory for unresolved token tests.""" + self.work_dir = tempfile.mkdtemp(prefix='osmo-local-tokens-') + + def tearDown(self): + """Remove the temporary work directory after each test.""" + shutil.rmtree(self.work_dir, ignore_errors=True) + + def test_jinja_variable_in_args_detected(self): + """A bare {{variable}} in args (without default-values section) is caught 
before execution.""" + spec_text = textwrap.dedent('''\ + workflow: + name: jinja-leak + tasks: + - name: task + image: "alpine:3.18" + command: ["echo"] + args: ["{{experiment_name}}"] + ''') + executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + with self.assertRaises(ValueError) as context: + executor.execute(spec) + self.assertIn('unresolved', str(context.exception).lower()) + self.assertIn('experiment_name', str(context.exception)) + + def test_jinja_variable_in_command_detected(self): + """A bare {{variable}} in command is caught before execution.""" + spec_text = textwrap.dedent('''\ + workflow: + name: jinja-leak + tasks: + - name: task + image: "alpine:3.18" + command: ["{{my_binary}}"] + ''') + executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + with self.assertRaises(ValueError) as context: + executor.execute(spec) + self.assertIn('my_binary', str(context.exception)) + + def test_jinja_variable_in_env_detected(self): + """A bare {{variable}} in environment values is caught before execution.""" + spec_text = textwrap.dedent('''\ + workflow: + name: jinja-leak + tasks: + - name: task + image: "alpine:3.18" + command: ["echo"] + environment: + MY_VAR: "{{some_value}}" + ''') + executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + with self.assertRaises(ValueError) as context: + executor.execute(spec) + self.assertIn('some_value', str(context.exception)) + + def test_jinja_variable_in_file_contents_detected(self): + """A bare {{variable}} in file contents is caught before execution.""" + spec_text = textwrap.dedent('''\ + workflow: + name: jinja-leak + tasks: + - name: task + image: "alpine:3.18" + command: ["sh", "/tmp/run.sh"] + files: + - contents: | + echo {{config_path}}/data + path: /tmp/run.sh + ''') + executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = 
executor.load_spec(spec_text) + with self.assertRaises(ValueError) as context: + executor.execute(spec) + self.assertIn('config_path', str(context.exception)) + + def test_typo_in_osmo_token_detected(self): + """A typo in an OSMO token (e.g., {{ouptut}}) is caught as unresolved.""" + spec_text = textwrap.dedent('''\ + workflow: + name: typo + tasks: + - name: task + image: "alpine:3.18" + command: ["sh", "-c"] + args: ["echo data > {{ouptut}}/file.txt"] + ''') + executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + with self.assertRaises(ValueError) as context: + executor.execute(spec) + self.assertIn('ouptut', str(context.exception)) + + @mock.patch('subprocess.run') + def test_valid_osmo_tokens_not_flagged(self, mock_run): + """Valid OSMO tokens ({{output}}, {{input:0}}) are resolved and not flagged as unresolved.""" + mock_run.return_value = mock.Mock(returncode=0) + spec_text = textwrap.dedent('''\ + workflow: + name: valid + tasks: + - name: producer + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo ok > {{output}}/data.txt"] + - name: consumer + image: alpine:3.18 + command: ["sh", "-c"] + args: ["cat {{input:0}}/data.txt > {{ output }}/result.txt"] + inputs: + - task: producer + ''') + executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + executor.execute(spec) + + def test_error_message_suggests_dry_run(self): + """The unresolved token error message suggests using --dry-run to expand templates.""" + spec_text = textwrap.dedent('''\ + workflow: + name: helpful + tasks: + - name: task + image: "alpine:3.18" + command: ["echo", "{{missing}}"] + ''') + executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + with self.assertRaises(ValueError) as context: + executor.execute(spec) + self.assertIn('dry-run', str(context.exception)) + + class TestShmSize(unittest.TestCase): """Verify that 
--shm-size is passed to Docker for GPU tasks.""" From 38a9e27cc46db4ca1d1c71b0ffa69bddeed360df Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Thu, 9 Apr 2026 12:53:52 -0700 Subject: [PATCH 21/35] Refactor LocalExecutor to StandaloneExecutor for Docker-based execution - Replaced the `LocalExecutor` with `StandaloneExecutor` to facilitate Docker-based workflow execution without Kubernetes. - Updated relevant documentation in `AGENTS.md` to reflect the new executor's capabilities and entry points. - Modified CLI commands and parser to integrate the `standalone` subcommand for executing workflows in standalone mode. - Adjusted build configurations to include the new `standalone_executor` and updated test references accordingly. - Introduced comprehensive tests for the `StandaloneExecutor` to ensure functionality and validate workflow execution scenarios. --- AGENTS.md | 4 +- src/cli/BUILD | 4 +- src/cli/main_parser.py | 4 +- src/cli/{local.py => standalone.py} | 22 +- src/utils/BUILD | 4 +- ...cal_executor.py => standalone_executor.py} | 38 ++-- src/utils/tests/BUILD | 6 +- ...xecutor.py => test_standalone_executor.py} | 208 +++++++++--------- 8 files changed, 144 insertions(+), 146 deletions(-) rename src/cli/{local.py => standalone.py} (80%) rename src/utils/{local_executor.py => standalone_executor.py} (95%) rename src/utils/tests/{test_local_executor.py => test_standalone_executor.py} (92%) diff --git a/AGENTS.md b/AGENTS.md index 3c8129e5a..90a401f2b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -120,7 +120,7 @@ Entry point: `service/core/service.py`. Framework: FastAPI + Uvicorn + OpenTelem | `utils/job/` | `Task`, `FrontendJob`, `K8sObjectFactory`, `PodGroupTopologyBuilder` | Workflow execution framework. Task → K8s spec generation. Gang scheduling via PodGroup. Topology constraints. Backend job definitions. | | `utils/connectors/` | `ClusterConnector`, `PostgresConnector`, `RedisConnector` | K8s API wrapper, PostgreSQL operations, Redis job queue management. 
| | `utils/secret_manager/` | `SecretManager` | JWE-based secret encryption/decryption. MEK/UEK key management. | -| `utils/local_executor.py` | `LocalExecutor`, `run_workflow_locally` | Local Docker-based workflow execution. Runs workflow specs without Kubernetes by mapping tasks to `docker run` commands with volume mounts for data flow. Supports DAG scheduling, resume (`--from-step`), and GPU passthrough. | +| `utils/standalone_executor.py` | `StandaloneExecutor`, `run_workflow_standalone` | Standalone Docker-based workflow execution. Runs workflow specs without Kubernetes by mapping tasks to `docker run` commands with volume mounts for data flow. Supports DAG scheduling, resume (`--from-step`), and GPU passthrough. | | `utils/progress_check/` | — | Liveness/progress tracking for long-running services. | | `utils/metrics/` | — | Prometheus metrics collection and export. | @@ -140,7 +140,7 @@ Entry point: `cli.py` → `main_parser.py` (argparse). Subcommand modules: | `login.py` | Authentication | | `pool.py`, `resources.py`, `user.py`, `credential.py`, `access_token.py`, `bucket.py`, `task.py`, `version.py` | Supporting commands | | `backend.py` | Backend cluster management | -| `local.py` | Local workflow execution via Docker (`osmo local run`) | +| `standalone.py` | Standalone workflow execution via Docker (`osmo standalone run`) | Features: Tab completion (shtab), response formatting (`formatters.py`), spec editor (`editor.py`), PyInstaller packaging (`cli_builder.py`, `packaging/`). 
diff --git a/src/cli/BUILD b/src/cli/BUILD index cdada591a..eade2ea71 100755 --- a/src/cli/BUILD +++ b/src/cli/BUILD @@ -37,7 +37,7 @@ osmo_py_library( "dataset.py", "editor.py", "formatters.py", - "local.py", + "standalone.py", "login.py", "main_parser.py", "pool.py", @@ -74,7 +74,7 @@ osmo_py_library( "//src/lib/utils:validation", "//src/lib/utils:version", "//src/lib/utils:workflow", - "//src/utils:local_executor", + "//src/utils:standalone_executor", ], ) diff --git a/src/cli/main_parser.py b/src/cli/main_parser.py index bd097111d..59dfc0043 100644 --- a/src/cli/main_parser.py +++ b/src/cli/main_parser.py @@ -28,7 +28,7 @@ credential, data, dataset, - local, + standalone, login, pool, profile, @@ -57,7 +57,7 @@ pool.setup_parser, user.setup_parser, config.setup_parser, - local.setup_parser, + standalone.setup_parser, ) diff --git a/src/cli/local.py b/src/cli/standalone.py similarity index 80% rename from src/cli/local.py rename to src/cli/standalone.py index 67eef4ca4..d30e764ff 100644 --- a/src/cli/local.py +++ b/src/cli/standalone.py @@ -21,20 +21,20 @@ import shtab -from src.utils import local_executor +from src.utils import standalone_executor def setup_parser(parser: argparse._SubParsersAction): - """Register the 'local' subcommand and its nested 'run' action with the CLI argument parser.""" - local_parser = parser.add_parser( - 'local', - help='Run workflows locally using Docker (no Kubernetes cluster required).') - subparsers = local_parser.add_subparsers(dest='command') + """Register the 'standalone' subcommand and its nested 'run' action with the CLI argument parser.""" + standalone_parser = parser.add_parser( + 'standalone', + help='Run workflows in standalone mode using Docker containers (no Kubernetes cluster required).') + subparsers = standalone_parser.add_subparsers(dest='command') subparsers.required = True run_parser = subparsers.add_parser( 'run', - help='Execute a workflow spec locally using Docker containers.') + help='Execute a workflow 
spec in standalone mode using Docker containers.') run_parser.add_argument( '-f', '--file', required=True, @@ -75,13 +75,13 @@ def setup_parser(parser: argparse._SubParsersAction): help='Shared memory size for GPU containers (e.g. 16g, 32g). ' 'Defaults to 16g for tasks that request GPUs. ' 'PyTorch DataLoader workers require large shared memory.') - run_parser.set_defaults(func=_run_local) + run_parser.set_defaults(func=_run_standalone) -def _run_local(service_client, args: argparse.Namespace): - """Execute a workflow locally via Docker using the parsed CLI arguments.""" +def _run_standalone(service_client, args: argparse.Namespace): + """Execute a workflow in standalone mode via Docker using the parsed CLI arguments.""" try: - success = local_executor.run_workflow_locally( + success = standalone_executor.run_workflow_standalone( spec_path=args.workflow_file, work_dir=args.work_dir, keep_work_dir=args.keep, diff --git a/src/utils/BUILD b/src/utils/BUILD index 8a29aa9af..8dbf59a94 100644 --- a/src/utils/BUILD +++ b/src/utils/BUILD @@ -128,8 +128,8 @@ osmo_py_library( ) osmo_py_library( - name = "local_executor", - srcs = ["local_executor.py"], + name = "standalone_executor", + srcs = ["standalone_executor.py"], deps = [ requirement("pyyaml"), "//src/utils/job", diff --git a/src/utils/local_executor.py b/src/utils/standalone_executor.py similarity index 95% rename from src/utils/local_executor.py rename to src/utils/standalone_executor.py index c516a46fb..8f4493140 100644 --- a/src/utils/local_executor.py +++ b/src/utils/standalone_executor.py @@ -58,9 +58,9 @@ class TaskResult: output_dir: str -class LocalExecutor: +class StandaloneExecutor: """ - Executes an OSMO workflow spec locally using Docker, without Kubernetes. + Executes an OSMO workflow spec in standalone mode using Docker, without Kubernetes. 
Supports: - Serial and parallel task DAGs @@ -126,7 +126,7 @@ def execute(self, spec: workflow_module.WorkflowSpec, """Run all tasks in topological order, returning True if the entire workflow succeeds.""" self._results.clear() self._build_dag(spec) - self._validate_for_local(spec) + self._validate_for_standalone(spec) self._setup_directories() if resume or from_step: @@ -310,8 +310,8 @@ def visit(name: str) -> List[str] | None: _HOST_TOKEN_PATTERN = re.compile(r'\{\{\s*host:[^}]+\}\}') - def _validate_for_local(self, spec: workflow_module.WorkflowSpec): - """Raise ValueError if the spec uses features unsupported in local mode (datasets, URLs, credentials, etc.).""" + def _validate_for_standalone(self, spec: workflow_module.WorkflowSpec): + """Raise ValueError if the spec uses features unsupported in standalone mode (datasets, URLs, credentials, etc.).""" unsupported_features = [] for group in self._groups(spec): for task_spec in group.tasks: @@ -342,11 +342,11 @@ def _validate_for_local(self, spec: workflow_module.WorkflowSpec): if task_spec.privileged: unsupported_features.append( - f'Task "{task_spec.name}": privileged containers are not supported in local mode') + f'Task "{task_spec.name}": privileged containers are not supported in standalone mode') if task_spec.hostNetwork: unsupported_features.append( - f'Task "{task_spec.name}": hostNetwork is not supported in local mode') + f'Task "{task_spec.name}": hostNetwork is not supported in standalone mode') if self._task_uses_host_tokens(task_spec): unsupported_features.append( @@ -355,7 +355,7 @@ def _validate_for_local(self, spec: workflow_module.WorkflowSpec): if unsupported_features: raise ValueError( - 'The following features are not supported in local execution mode:\n - ' + 'The following features are not supported in standalone execution mode:\n - ' + '\n - '.join(unsupported_features)) def _task_uses_host_tokens(self, task_spec: task_module.TaskSpec) -> bool: @@ -543,13 +543,13 @@ def 
_check_unresolved_tokens(self, task_name: str, resolved_fields: List[str]): f'first to expand them.') -def run_workflow_locally(spec_path: str, work_dir: str | None = None, - keep_work_dir: bool = False, - resume: bool = False, - from_step: str | None = None, - docker_cmd: str = 'docker', - shm_size: str | None = None) -> bool: - """Load a workflow spec from disk and execute it locally via Docker, managing the work directory lifecycle.""" +def run_workflow_standalone(spec_path: str, work_dir: str | None = None, + keep_work_dir: bool = False, + resume: bool = False, + from_step: str | None = None, + docker_cmd: str = 'docker', + shm_size: str | None = None) -> bool: + """Load a workflow spec from disk and execute it in standalone mode via Docker, managing the work directory lifecycle.""" if (resume or from_step) and work_dir is None: raise ValueError( '--resume and --from-step require --work-dir pointing to a previous run directory.') @@ -562,15 +562,15 @@ def run_workflow_locally(spec_path: str, work_dir: str | None = None, raise ValueError( 'This spec uses Jinja templates which require server-side expansion.\n' 'Run "osmo workflow submit --dry-run -f " first to get the expanded spec,\n' - 'then save that output and run it locally.') + 'then save that output and run it standalone.') created_work_dir = work_dir is None if work_dir is None: - work_dir = tempfile.mkdtemp(prefix='osmo-local-') + work_dir = tempfile.mkdtemp(prefix='osmo-standalone-') logger.info('Using temporary work directory: %s', work_dir) - executor = LocalExecutor(work_dir=work_dir, keep_work_dir=keep_work_dir, - docker_cmd=docker_cmd, shm_size=shm_size) + executor = StandaloneExecutor(work_dir=work_dir, keep_work_dir=keep_work_dir, + docker_cmd=docker_cmd, shm_size=shm_size) spec = executor.load_spec(spec_text) success = executor.execute(spec, resume=resume or from_step is not None, from_step=from_step) diff --git a/src/utils/tests/BUILD b/src/utils/tests/BUILD index efe72682f..b555f85bc 100644 
--- a/src/utils/tests/BUILD +++ b/src/utils/tests/BUILD @@ -56,10 +56,10 @@ osmo_py_test( ) py_test( - name = "test_local_executor", - srcs = ["test_local_executor.py"], + name = "test_standalone_executor", + srcs = ["test_standalone_executor.py"], deps = [ - "//src/utils:local_executor", + "//src/utils:standalone_executor", ], data = [ "//cookbook/tutorials:tutorial_specs", diff --git a/src/utils/tests/test_local_executor.py b/src/utils/tests/test_standalone_executor.py similarity index 92% rename from src/utils/tests/test_local_executor.py rename to src/utils/tests/test_standalone_executor.py index 3c234c0dc..2eae253f6 100644 --- a/src/utils/tests/test_local_executor.py +++ b/src/utils/tests/test_standalone_executor.py @@ -25,7 +25,7 @@ from unittest import mock from src.utils.job import task as task_module -from src.utils.local_executor import CONTAINER_DATA_PATH, LocalExecutor, TaskNode, TaskResult, run_workflow_locally +from src.utils.standalone_executor import CONTAINER_DATA_PATH, StandaloneExecutor, TaskNode, TaskResult, run_workflow_standalone # --------------------------------------------------------------------------- @@ -65,7 +65,7 @@ def test_single_task_spec(self): command: ["echo"] args: ["Hello from OSMO!"] ''') - executor = LocalExecutor(work_dir='/tmp/unused') + executor = StandaloneExecutor(work_dir='/tmp/unused') spec = executor.load_spec(spec_text) self.assertEqual(spec.name, 'hello-osmo') self.assertEqual(len(spec.tasks), 1) @@ -98,7 +98,7 @@ def test_serial_tasks_spec(self): inputs: - task: task1 ''') - executor = LocalExecutor(work_dir='/tmp/unused') + executor = StandaloneExecutor(work_dir='/tmp/unused') spec = executor.load_spec(spec_text) self.assertEqual(spec.name, 'serial-tasks') self.assertEqual(len(spec.tasks), 2) @@ -123,7 +123,7 @@ def test_groups_spec(self): image: ubuntu:24.04 command: ["echo", "follower"] ''') - executor = LocalExecutor(work_dir='/tmp/unused') + executor = StandaloneExecutor(work_dir='/tmp/unused') spec = 
executor.load_spec(spec_text) self.assertEqual(len(spec.groups), 1) self.assertEqual(len(spec.groups[0].tasks), 2) @@ -140,7 +140,7 @@ def test_versioned_spec(self): image: alpine:3.18 command: ["echo", "ok"] ''') - executor = LocalExecutor(work_dir='/tmp/unused') + executor = StandaloneExecutor(work_dir='/tmp/unused') spec = executor.load_spec(spec_text) self.assertEqual(spec.name, 'versioned') @@ -155,7 +155,7 @@ def test_invalid_version_rejected(self): image: alpine:3.18 command: ["echo", "ok"] ''') - executor = LocalExecutor(work_dir='/tmp/unused') + executor = StandaloneExecutor(work_dir='/tmp/unused') with self.assertRaises(ValueError): executor.load_spec(spec_text) @@ -175,7 +175,7 @@ def test_both_tasks_and_groups_rejected(self): image: alpine:3.18 command: ["echo"] ''') - executor = LocalExecutor(work_dir='/tmp/unused') + executor = StandaloneExecutor(work_dir='/tmp/unused') with self.assertRaises(ValueError): executor.load_spec(spec_text) @@ -185,7 +185,7 @@ def test_empty_workflow_rejected(self): workflow: name: empty ''') - executor = LocalExecutor(work_dir='/tmp/unused') + executor = StandaloneExecutor(work_dir='/tmp/unused') with self.assertRaises(ValueError): executor.load_spec(spec_text) @@ -204,7 +204,7 @@ def test_resources_spec_parsed(self): image: ubuntu:24.04 command: ["echo", "ok"] ''') - executor = LocalExecutor(work_dir='/tmp/unused') + executor = StandaloneExecutor(work_dir='/tmp/unused') spec = executor.load_spec(spec_text) self.assertEqual(spec.resources['default'].cpu, 2) self.assertEqual(spec.resources['default'].memory, '4Gi') @@ -222,7 +222,7 @@ def test_environment_parsed(self): MY_VAR: hello ANOTHER: world ''') - executor = LocalExecutor(work_dir='/tmp/unused') + executor = StandaloneExecutor(work_dir='/tmp/unused') spec = executor.load_spec(spec_text) self.assertEqual(spec.tasks[0].environment['MY_VAR'], 'hello') self.assertEqual(spec.tasks[0].environment['ANOTHER'], 'world') @@ -231,9 +231,9 @@ def test_environment_parsed(self): 
class TestBuildDag(unittest.TestCase): """Verify DAG construction from task dependencies.""" - def _make_executor(self) -> LocalExecutor: - """Create a LocalExecutor with a throwaway work directory for DAG-only tests.""" - return LocalExecutor(work_dir='/tmp/unused') + def _make_executor(self) -> StandaloneExecutor: + """Create a StandaloneExecutor with a throwaway work directory for DAG-only tests.""" + return StandaloneExecutor(work_dir='/tmp/unused') def test_no_dependencies(self): """All tasks with no input dependencies have empty upstream and downstream sets.""" @@ -374,9 +374,9 @@ def test_groups_with_cross_group_deps(self): class TestCycleDetection(unittest.TestCase): """Verify that circular dependencies are detected and reported during DAG construction.""" - def _make_executor(self) -> LocalExecutor: - """Create a LocalExecutor with a throwaway work directory for cycle-detection tests.""" - return LocalExecutor(work_dir='/tmp/unused') + def _make_executor(self) -> StandaloneExecutor: + """Create a StandaloneExecutor with a throwaway work directory for cycle-detection tests.""" + return StandaloneExecutor(work_dir='/tmp/unused') def test_direct_cycle_two_tasks(self): """Two tasks that depend on each other form a direct cycle and are rejected.""" @@ -533,7 +533,7 @@ def test_all_root_tasks_ready(self): image: alpine:3.18 command: ["echo"] ''') - executor = LocalExecutor(work_dir='/tmp/unused') + executor = StandaloneExecutor(work_dir='/tmp/unused') spec = executor.load_spec(spec_text) executor._build_dag(spec) @@ -555,7 +555,7 @@ def test_dependent_not_ready_until_upstream_completes(self): inputs: - task: first ''') - executor = LocalExecutor(work_dir='/tmp/unused') + executor = StandaloneExecutor(work_dir='/tmp/unused') spec = executor.load_spec(spec_text) executor._build_dag(spec) @@ -581,7 +581,7 @@ def test_failed_upstream_blocks_downstream(self): inputs: - task: first ''') - executor = LocalExecutor(work_dir='/tmp/unused') + executor = 
StandaloneExecutor(work_dir='/tmp/unused') spec = executor.load_spec(spec_text) executor._build_dag(spec) @@ -613,7 +613,7 @@ def test_cascading_cancel(self): inputs: - task: b ''') - executor = LocalExecutor(work_dir='/tmp/unused') + executor = StandaloneExecutor(work_dir='/tmp/unused') spec = executor.load_spec(spec_text) executor._build_dag(spec) @@ -631,42 +631,42 @@ class TestSubstituteTokens(unittest.TestCase): def test_output_token(self): """The {{output}} token is replaced with the task output directory path.""" - executor = LocalExecutor(work_dir='/tmp/unused') + executor = StandaloneExecutor(work_dir='/tmp/unused') tokens = {'output': '/work/task1/output'} result = executor._substitute_tokens('echo data > {{output}}/file.txt', tokens) self.assertEqual(result, 'echo data > /work/task1/output/file.txt') def test_input_by_index(self): """The {{input:N}} token is replaced with the Nth upstream output directory.""" - executor = LocalExecutor(work_dir='/tmp/unused') + executor = StandaloneExecutor(work_dir='/tmp/unused') tokens = {'input:0': '/work/upstream/output'} result = executor._substitute_tokens('cat {{input:0}}/data.csv', tokens) self.assertEqual(result, 'cat /work/upstream/output/data.csv') def test_input_by_name(self): """The {{input:taskname}} token is replaced with the named task's output directory.""" - executor = LocalExecutor(work_dir='/tmp/unused') + executor = StandaloneExecutor(work_dir='/tmp/unused') tokens = {'input:task1': '/work/task1/output'} result = executor._substitute_tokens('cat {{ input:task1 }}/data.csv', tokens) self.assertEqual(result, 'cat /work/task1/output/data.csv') def test_whitespace_around_tokens(self): """Whitespace inside {{ token }} braces is tolerated during substitution.""" - executor = LocalExecutor(work_dir='/tmp/unused') + executor = StandaloneExecutor(work_dir='/tmp/unused') tokens = {'output': '/out'} result = executor._substitute_tokens('{{ output }}/file.txt', tokens) self.assertEqual(result, '/out/file.txt') 
def test_multiple_tokens_in_one_string(self): """Multiple distinct tokens in the same string are all replaced.""" - executor = LocalExecutor(work_dir='/tmp/unused') + executor = StandaloneExecutor(work_dir='/tmp/unused') tokens = {'output': '/out', 'input:0': '/in0'} result = executor._substitute_tokens('cp {{input:0}}/src {{output}}/dst', tokens) self.assertEqual(result, 'cp /in0/src /out/dst') def test_no_tokens_unchanged(self): """Text without any token placeholders passes through unchanged.""" - executor = LocalExecutor(work_dir='/tmp/unused') + executor = StandaloneExecutor(work_dir='/tmp/unused') result = executor._substitute_tokens('plain text no tokens', {}) self.assertEqual(result, 'plain text no tokens') @@ -684,7 +684,7 @@ def test_output_only(self): image: alpine:3.18 command: ["echo"] ''') - executor = LocalExecutor(work_dir='/tmp/work') + executor = StandaloneExecutor(work_dir='/tmp/work') spec = executor.load_spec(spec_text) executor._build_dag(spec) @@ -708,7 +708,7 @@ def test_with_upstream_inputs(self): inputs: - task: producer ''') - executor = LocalExecutor(work_dir='/tmp/work') + executor = StandaloneExecutor(work_dir='/tmp/work') spec = executor.load_spec(spec_text) executor._build_dag(spec) @@ -723,15 +723,15 @@ def test_with_upstream_inputs(self): self.assertEqual(tokens['input:producer'], f'{CONTAINER_DATA_PATH}/input/0') -class TestValidateForLocal(unittest.TestCase): +class TestValidateForStandalone(unittest.TestCase): """Verify that unsupported features are detected and rejected.""" - def _make_executor(self) -> LocalExecutor: - """Create a LocalExecutor with a throwaway work directory for validation-only tests.""" - return LocalExecutor(work_dir='/tmp/unused') + def _make_executor(self) -> StandaloneExecutor: + """Create a StandaloneExecutor with a throwaway work directory for validation-only tests.""" + return StandaloneExecutor(work_dir='/tmp/unused') def test_simple_spec_passes(self): - """A spec using only task-to-task inputs passes 
local validation.""" + """A spec using only task-to-task inputs passes standalone validation.""" spec_text = textwrap.dedent('''\ workflow: name: ok @@ -743,10 +743,10 @@ def test_simple_spec_passes(self): executor = self._make_executor() spec = executor.load_spec(spec_text) executor._build_dag(spec) - executor._validate_for_local(spec) + executor._validate_for_standalone(spec) def test_dataset_input_rejected(self): - """A spec with dataset inputs is rejected as unsupported in local mode.""" + """A spec with dataset inputs is rejected as unsupported in standalone mode.""" spec_text = textwrap.dedent('''\ workflow: name: bad @@ -762,11 +762,11 @@ def test_dataset_input_rejected(self): spec = executor.load_spec(spec_text) executor._build_dag(spec) with self.assertRaises(ValueError) as context: - executor._validate_for_local(spec) + executor._validate_for_standalone(spec) self.assertIn('dataset', str(context.exception)) def test_url_input_rejected(self): - """A spec with URL inputs is rejected as unsupported in local mode.""" + """A spec with URL inputs is rejected as unsupported in standalone mode.""" spec_text = textwrap.dedent('''\ workflow: name: bad @@ -781,11 +781,11 @@ def test_url_input_rejected(self): spec = executor.load_spec(spec_text) executor._build_dag(spec) with self.assertRaises(ValueError) as context: - executor._validate_for_local(spec) + executor._validate_for_standalone(spec) self.assertIn('URL', str(context.exception)) def test_dataset_output_rejected(self): - """A spec with dataset outputs is rejected as unsupported in local mode.""" + """A spec with dataset outputs is rejected as unsupported in standalone mode.""" spec_text = textwrap.dedent('''\ workflow: name: bad @@ -801,11 +801,11 @@ def test_dataset_output_rejected(self): spec = executor.load_spec(spec_text) executor._build_dag(spec) with self.assertRaises(ValueError) as context: - executor._validate_for_local(spec) + executor._validate_for_standalone(spec) self.assertIn('dataset', 
str(context.exception).lower()) def test_url_output_rejected(self): - """A spec with URL outputs is rejected as unsupported in local mode.""" + """A spec with URL outputs is rejected as unsupported in standalone mode.""" spec_text = textwrap.dedent('''\ workflow: name: bad @@ -820,7 +820,7 @@ def test_url_output_rejected(self): spec = executor.load_spec(spec_text) executor._build_dag(spec) with self.assertRaises(ValueError) as context: - executor._validate_for_local(spec) + executor._validate_for_standalone(spec) self.assertIn('object storage', str(context.exception).lower()) def test_multiple_unsupported_features_all_reported(self): @@ -845,13 +845,13 @@ def test_multiple_unsupported_features_all_reported(self): spec = executor.load_spec(spec_text) executor._build_dag(spec) with self.assertRaises(ValueError) as context: - executor._validate_for_local(spec) + executor._validate_for_standalone(spec) error_message = str(context.exception) self.assertIn('task1', error_message) self.assertIn('task2', error_message) def test_task_deps_only_passes(self): - """A spec with only task-to-task dependencies passes local validation.""" + """A spec with only task-to-task dependencies passes standalone validation.""" spec_text = textwrap.dedent('''\ workflow: name: ok @@ -868,10 +868,10 @@ def test_task_deps_only_passes(self): executor = self._make_executor() spec = executor.load_spec(spec_text) executor._build_dag(spec) - executor._validate_for_local(spec) + executor._validate_for_standalone(spec) def test_files_and_env_pass(self): - """A spec using files and environment variables passes local validation.""" + """A spec using files and environment variables passes standalone validation.""" spec_text = textwrap.dedent('''\ workflow: name: ok @@ -888,11 +888,11 @@ def test_files_and_env_pass(self): executor = self._make_executor() spec = executor.load_spec(spec_text) executor._build_dag(spec) - executor._validate_for_local(spec) + executor._validate_for_standalone(spec) -class 
TestValidateForLocalRemainingBranches(unittest.TestCase): - """Verify that _validate_for_local rejects credentials, checkpoint, volumeMounts, privileged, and hostNetwork.""" +class TestValidateForStandaloneRemainingBranches(unittest.TestCase): + """Verify that _validate_for_standalone rejects credentials, checkpoint, volumeMounts, privileged, and hostNetwork.""" _UNSUPPORTED_SPECS = { 'credentials': { @@ -1024,11 +1024,11 @@ def test_unsupported_fields_rejected(self): """Each unsupported task-level field is detected and rejected with a descriptive error.""" for feature, case in self._UNSUPPORTED_SPECS.items(): with self.subTest(feature=feature): - executor = LocalExecutor(work_dir='/tmp/unused') + executor = StandaloneExecutor(work_dir='/tmp/unused') spec = executor.load_spec(case['yaml']) executor._build_dag(spec) with self.assertRaises(ValueError) as context: - executor._validate_for_local(spec) + executor._validate_for_standalone(spec) self.assertIn(case['expected_substring'], str(context.exception)) @@ -1037,7 +1037,7 @@ class TestFilePathTraversal(unittest.TestCase): def setUp(self): """Create a temporary work directory.""" - self.work_dir = tempfile.mkdtemp(prefix='osmo-local-traversal-') + self.work_dir = tempfile.mkdtemp(prefix='osmo-standalone-traversal-') def tearDown(self): """Remove the temporary work directory.""" @@ -1058,7 +1058,7 @@ def test_path_traversal_rejected(self, mock_run): - contents: "malicious" path: /../../etc/evil.conf ''') - executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) spec = executor.load_spec(spec_text) executor._build_dag(spec) executor._setup_directories() @@ -1082,7 +1082,7 @@ def test_safe_nested_path_accepted(self, mock_run): - contents: "safe" path: /tmp/scripts/run.sh ''') - executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) spec = 
executor.load_spec(spec_text) executor._build_dag(spec) executor._setup_directories() @@ -1096,7 +1096,7 @@ class TestLeadTaskFailurePolicy(unittest.TestCase): def setUp(self): """Create a temporary work directory for lead-task policy tests.""" - self.work_dir = tempfile.mkdtemp(prefix='osmo-local-lead-') + self.work_dir = tempfile.mkdtemp(prefix='osmo-standalone-lead-') def tearDown(self): """Remove the temporary work directory after each test.""" @@ -1123,7 +1123,7 @@ def test_nonlead_failure_ignored_when_flag_true(self, mock_run): image: alpine:3.18 command: ["sh", "-c", "exit 1"] ''') - executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) spec = executor.load_spec(spec_text) self.assertTrue(executor.execute(spec)) @@ -1145,7 +1145,7 @@ def test_lead_failure_aborts_workflow(self, mock_run): image: alpine:3.18 command: ["echo", "ok"] ''') - executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) spec = executor.load_spec(spec_text) self.assertFalse(executor.execute(spec)) @@ -1171,7 +1171,7 @@ def test_nonlead_failure_aborts_when_flag_false(self, mock_run): image: alpine:3.18 command: ["sh", "-c", "exit 1"] ''') - executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) spec = executor.load_spec(spec_text) self.assertFalse(executor.execute(spec)) @@ -1205,7 +1205,7 @@ def test_nonlead_failure_does_not_block_downstream_group(self, mock_run): inputs: - task: leader ''') - executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) spec = executor.load_spec(spec_text) self.assertTrue(executor.execute(spec)) self.assertEqual(mock_run.call_count, 3) @@ -1222,7 +1222,7 @@ def test_single_task_group_failure_aborts(self, 
mock_run): image: alpine:3.18 command: ["sh", "-c", "exit 1"] ''') - executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) spec = executor.load_spec(spec_text) self.assertFalse(executor.execute(spec)) @@ -1232,7 +1232,7 @@ class TestUnresolvedTokenDetection(unittest.TestCase): def setUp(self): """Create a temporary work directory for unresolved token tests.""" - self.work_dir = tempfile.mkdtemp(prefix='osmo-local-tokens-') + self.work_dir = tempfile.mkdtemp(prefix='osmo-standalone-tokens-') def tearDown(self): """Remove the temporary work directory after each test.""" @@ -1249,7 +1249,7 @@ def test_jinja_variable_in_args_detected(self): command: ["echo"] args: ["{{experiment_name}}"] ''') - executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) spec = executor.load_spec(spec_text) with self.assertRaises(ValueError) as context: executor.execute(spec) @@ -1266,7 +1266,7 @@ def test_jinja_variable_in_command_detected(self): image: "alpine:3.18" command: ["{{my_binary}}"] ''') - executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) spec = executor.load_spec(spec_text) with self.assertRaises(ValueError) as context: executor.execute(spec) @@ -1284,7 +1284,7 @@ def test_jinja_variable_in_env_detected(self): environment: MY_VAR: "{{some_value}}" ''') - executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) spec = executor.load_spec(spec_text) with self.assertRaises(ValueError) as context: executor.execute(spec) @@ -1304,7 +1304,7 @@ def test_jinja_variable_in_file_contents_detected(self): echo {{config_path}}/data path: /tmp/run.sh ''') - executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + executor 
= StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) spec = executor.load_spec(spec_text) with self.assertRaises(ValueError) as context: executor.execute(spec) @@ -1321,7 +1321,7 @@ def test_typo_in_osmo_token_detected(self): command: ["sh", "-c"] args: ["echo data > {{ouptut}}/file.txt"] ''') - executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) spec = executor.load_spec(spec_text) with self.assertRaises(ValueError) as context: executor.execute(spec) @@ -1346,7 +1346,7 @@ def test_valid_osmo_tokens_not_flagged(self, mock_run): inputs: - task: producer ''') - executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) spec = executor.load_spec(spec_text) executor.execute(spec) @@ -1360,7 +1360,7 @@ def test_error_message_suggests_dry_run(self): image: "alpine:3.18" command: ["echo", "{{missing}}"] ''') - executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) spec = executor.load_spec(spec_text) with self.assertRaises(ValueError) as context: executor.execute(spec) @@ -1372,7 +1372,7 @@ class TestShmSize(unittest.TestCase): def setUp(self): """Create a temporary work directory for shm-size tests.""" - self.work_dir = tempfile.mkdtemp(prefix='osmo-local-shm-') + self.work_dir = tempfile.mkdtemp(prefix='osmo-standalone-shm-') def tearDown(self): """Remove the temporary work directory after each test.""" @@ -1394,7 +1394,7 @@ def test_gpu_task_gets_default_shm_size(self, mock_run): resource: gpu-resource command: ["python", "train.py"] ''') - executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) spec = executor.load_spec(spec_text) executor._build_dag(spec) executor._setup_directories() @@ -1422,7 +1422,7 
@@ def test_gpu_task_gets_custom_shm_size(self, mock_run): resource: gpu-resource command: ["python", "train.py"] ''') - executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True, shm_size='32g') + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True, shm_size='32g') spec = executor.load_spec(spec_text) executor._build_dag(spec) executor._setup_directories() @@ -1446,7 +1446,7 @@ def test_non_gpu_task_has_no_default_shm_size(self, mock_run): image: alpine:3.18 command: ["echo", "ok"] ''') - executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) spec = executor.load_spec(spec_text) executor._build_dag(spec) executor._setup_directories() @@ -1468,7 +1468,7 @@ def test_non_gpu_task_gets_explicit_shm_size(self, mock_run): image: alpine:3.18 command: ["echo", "ok"] ''') - executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True, shm_size='8g') + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True, shm_size='8g') spec = executor.load_spec(spec_text) executor._build_dag(spec) executor._setup_directories() @@ -1504,7 +1504,7 @@ def test_jinja_block_detected(self): ''')) try: with self.assertRaises(ValueError) as context: - run_workflow_locally(path) + run_workflow_standalone(path) self.assertIn('Jinja', str(context.exception)) finally: os.unlink(path) @@ -1522,7 +1522,7 @@ def test_jinja_comment_detected(self): ''')) try: with self.assertRaises(ValueError) as context: - run_workflow_locally(path) + run_workflow_standalone(path) self.assertIn('Jinja', str(context.exception)) finally: os.unlink(path) @@ -1541,7 +1541,7 @@ def test_default_values_section_detected(self): ''')) try: with self.assertRaises(ValueError) as context: - run_workflow_locally(path) + run_workflow_standalone(path) self.assertIn('Jinja', str(context.exception)) finally: os.unlink(path) @@ -1555,7 +1555,7 @@ class 
TestDockerNotFoundHandling(unittest.TestCase): def setUp(self): """Create a temporary work directory.""" - self.work_dir = tempfile.mkdtemp(prefix='osmo-local-test-') + self.work_dir = tempfile.mkdtemp(prefix='osmo-standalone-test-') def tearDown(self): """Remove the temporary work directory.""" @@ -1571,7 +1571,7 @@ def test_docker_not_found_graceful_failure(self): image: alpine:3.18 command: ["echo", "ok"] ''') - executor = LocalExecutor( + executor = StandaloneExecutor( work_dir=self.work_dir, keep_work_dir=True, docker_cmd='nonexistent-docker-binary-12345', @@ -1591,18 +1591,18 @@ class TestCookbookSpecValidation(unittest.TestCase): def setUp(self): """Create a temporary work directory for cookbook validation tests.""" - self.work_dir = tempfile.mkdtemp(prefix='osmo-local-cookbook-') + self.work_dir = tempfile.mkdtemp(prefix='osmo-standalone-cookbook-') def tearDown(self): """Remove the temporary work directory after each test.""" shutil.rmtree(self.work_dir, ignore_errors=True) def _run_cookbook_spec(self, filename: str) -> bool: - """Execute a cookbook tutorial spec file through the local executor.""" + """Execute a cookbook tutorial spec file through the standalone executor.""" spec_path = os.path.join(self.COOKBOOK_DIR, filename) self.assertTrue(os.path.exists(spec_path), f'Cookbook file not found: {spec_path}') - return run_workflow_locally( + return run_workflow_standalone( spec_path=spec_path, work_dir=self.work_dir, keep_work_dir=True, @@ -1632,7 +1632,7 @@ def test_unsupported_spec_template(self): self.assertTrue(os.path.exists(spec_path), f'Cookbook file not found: {spec_path}') with self.assertRaises(ValueError) as context: - run_workflow_locally( + run_workflow_standalone( spec_path=spec_path, work_dir=self.work_dir, keep_work_dir=True, @@ -1640,13 +1640,13 @@ def test_unsupported_spec_template(self): self.assertIn('Jinja', str(context.exception)) -class TestRunWorkflowLocallyErrors(unittest.TestCase): - """Test error handling in 
run_workflow_locally() that does not require Docker.""" +class TestRunWorkflowStandaloneErrors(unittest.TestCase): + """Test error handling in run_workflow_standalone() that does not require Docker.""" def test_nonexistent_file_raises(self): """Passing a non-existent spec file path raises FileNotFoundError.""" with self.assertRaises(FileNotFoundError): - run_workflow_locally(spec_path='/nonexistent/path/spec.yaml') + run_workflow_standalone(spec_path='/nonexistent/path/spec.yaml') # ============================================================================ @@ -1655,13 +1655,13 @@ def test_nonexistent_file_raises(self): @unittest.skipUnless(DOCKER_AVAILABLE, SKIP_DOCKER_MSG) class TestDockerExecution(unittest.TestCase): """ - Integration tests that run real OSMO workflow specs through the local executor + Integration tests that run real OSMO workflow specs through the standalone executor using Docker. Each test uses a spec that would normally run on a Kubernetes cluster. """ def setUp(self): """Create a temporary work directory for each Docker execution test.""" - self.work_dir = tempfile.mkdtemp(prefix='osmo-local-test-') + self.work_dir = tempfile.mkdtemp(prefix='osmo-standalone-test-') def tearDown(self): """Remove the temporary work directory after each test.""" @@ -1669,7 +1669,7 @@ def tearDown(self): def _execute_spec(self, spec_text: str) -> bool: """Parse and execute a workflow spec string, returning the success status.""" - executor = LocalExecutor(work_dir=self.work_dir, keep_work_dir=True) + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) spec = executor.load_spec(spec_text) return executor.execute(spec) @@ -1961,8 +1961,6 @@ def test_parallel_failure_does_not_affect_independent_branch(self): - task: root ''') result = self._execute_spec(spec_text) - # The executor should stop on first failure, so the overall result is False. - # root succeeds, then one of the branches fails. 
self.assertFalse(result) # ---- Groups (ganged tasks) tests ---- @@ -2078,7 +2076,7 @@ def test_file_contents_with_token_substitution(self): # ---- Resource spec ignored gracefully ---- def test_resources_ignored_gracefully(self): - """Resource specs are K8s-specific; local executor should accept and ignore them.""" + """Resource specs are K8s-specific; standalone executor should accept and ignore them.""" spec_text = textwrap.dedent('''\ workflow: name: with-resources @@ -2106,7 +2104,7 @@ def test_custom_docker_command(self): image: alpine:3.18 command: ["echo", "ok"] ''') - executor = LocalExecutor( + executor = StandaloneExecutor( work_dir=self.work_dir, keep_work_dir=True, docker_cmd='docker', @@ -2122,7 +2120,7 @@ def test_custom_docker_command(self): class TestCookbookSpecs(unittest.TestCase): """ Run real OSMO cookbook YAML specs that are designed for Kubernetes clusters, - and verify they execute successfully in the local Docker executor. + and verify they execute successfully in the standalone Docker executor. 
""" COOKBOOK_DIR = os.path.join(os.path.dirname(__file__), '..', '..', '..', @@ -2130,18 +2128,18 @@ class TestCookbookSpecs(unittest.TestCase): def setUp(self): """Create a temporary work directory for cookbook spec tests.""" - self.work_dir = tempfile.mkdtemp(prefix='osmo-local-cookbook-') + self.work_dir = tempfile.mkdtemp(prefix='osmo-standalone-cookbook-') def tearDown(self): """Remove the temporary work directory after each cookbook test.""" shutil.rmtree(self.work_dir, ignore_errors=True) def _run_cookbook_spec(self, filename: str) -> bool: - """Execute a cookbook tutorial spec file through the local executor.""" + """Execute a cookbook tutorial spec file through the standalone executor.""" spec_path = os.path.join(self.COOKBOOK_DIR, filename) self.assertTrue(os.path.exists(spec_path), f'Cookbook file not found: {spec_path}') - return run_workflow_locally( + return run_workflow_standalone( spec_path=spec_path, work_dir=self.work_dir, keep_work_dir=True, @@ -2173,15 +2171,15 @@ def test_combination_workflow_simple_yaml(self): # ============================================================================ -# run_workflow_locally() integration tests +# run_workflow_standalone() integration tests # ============================================================================ @unittest.skipUnless(DOCKER_AVAILABLE, SKIP_DOCKER_MSG) -class TestRunWorkflowLocally(unittest.TestCase): - """Test the top-level run_workflow_locally() convenience function.""" +class TestRunWorkflowStandalone(unittest.TestCase): + """Test the top-level run_workflow_standalone() convenience function.""" def setUp(self): - """Create a temporary work directory for run_workflow_locally tests.""" - self.work_dir = tempfile.mkdtemp(prefix='osmo-local-func-') + """Create a temporary work directory for run_workflow_standalone tests.""" + self.work_dir = tempfile.mkdtemp(prefix='osmo-standalone-func-') def tearDown(self): """Remove the temporary work directory after each test.""" @@ -2189,7 +2187,7 
@@ def tearDown(self): def test_caller_supplied_work_dir_preserved_on_success(self): """A caller-supplied work_dir is never deleted, even with keep_work_dir=False.""" - work_dir = tempfile.mkdtemp(prefix='osmo-local-cleanup-') + work_dir = tempfile.mkdtemp(prefix='osmo-standalone-cleanup-') with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: f.write(textwrap.dedent('''\ workflow: @@ -2201,7 +2199,7 @@ def test_caller_supplied_work_dir_preserved_on_success(self): ''')) spec_path = f.name try: - result = run_workflow_locally( + result = run_workflow_standalone( spec_path=spec_path, work_dir=work_dir, keep_work_dir=False, @@ -2226,7 +2224,7 @@ def test_failure_preserves_work_dir(self): ''')) spec_path = f.name try: - result = run_workflow_locally( + result = run_workflow_standalone( spec_path=spec_path, work_dir=self.work_dir, keep_work_dir=False, @@ -2249,7 +2247,7 @@ def test_keep_flag_preserves_on_success(self): ''')) spec_path = f.name try: - result = run_workflow_locally( + result = run_workflow_standalone( spec_path=spec_path, work_dir=self.work_dir, keep_work_dir=True, From 1b705ed6fecc2ad619c97a65ea4b453b555dd1eb Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Thu, 9 Apr 2026 14:11:19 -0700 Subject: [PATCH 22/35] Add Docker Compose support for parallel workflow execution - Introduced `ComposeExecutor` to enable Docker Compose-based parallel execution of workflows, extending the functionality of `StandaloneExecutor`. - Updated `AGENTS.md` to document the new `ComposeExecutor` and its capabilities, including wave-parallel scheduling and GPU support. - Added a new `docker_compose.py` CLI subcommand for executing workflows using Docker Compose. - Implemented tests for `ComposeExecutor` to validate functionality and ensure correct generation of Docker Compose files. - Adjusted build configurations to include the new `compose_executor` library and corresponding tests. 
--- AGENTS.md | 2 + src/cli/docker_compose.py | 85 ++ src/cli/main_parser.py | 2 + src/utils/BUILD | 11 + src/utils/compose_executor.py | 527 +++++++++++ src/utils/tests/BUILD | 10 + src/utils/tests/test_compose_executor.py | 1073 ++++++++++++++++++++++ 7 files changed, 1710 insertions(+) create mode 100644 src/cli/docker_compose.py create mode 100644 src/utils/compose_executor.py create mode 100644 src/utils/tests/test_compose_executor.py diff --git a/AGENTS.md b/AGENTS.md index 90a401f2b..67e102378 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -121,6 +121,7 @@ Entry point: `service/core/service.py`. Framework: FastAPI + Uvicorn + OpenTelem | `utils/connectors/` | `ClusterConnector`, `PostgresConnector`, `RedisConnector` | K8s API wrapper, PostgreSQL operations, Redis job queue management. | | `utils/secret_manager/` | `SecretManager` | JWE-based secret encryption/decryption. MEK/UEK key management. | | `utils/standalone_executor.py` | `StandaloneExecutor`, `run_workflow_standalone` | Standalone Docker-based workflow execution. Runs workflow specs without Kubernetes by mapping tasks to `docker run` commands with volume mounts for data flow. Supports DAG scheduling, resume (`--from-step`), and GPU passthrough. | +| `utils/compose_executor.py` | `ComposeExecutor`, `run_workflow_compose` | Docker Compose-based parallel workflow execution. Extends StandaloneExecutor with wave-parallel scheduling, `{{host:taskname}}` DNS resolution via shared Compose networks, and GPU support via deploy resource reservations. | | `utils/progress_check/` | — | Liveness/progress tracking for long-running services. | | `utils/metrics/` | — | Prometheus metrics collection and export. | @@ -141,6 +142,7 @@ Entry point: `cli.py` → `main_parser.py` (argparse). 
Subcommand modules: | `pool.py`, `resources.py`, `user.py`, `credential.py`, `access_token.py`, `bucket.py`, `task.py`, `version.py` | Supporting commands | | `backend.py` | Backend cluster management | | `standalone.py` | Standalone workflow execution via Docker (`osmo standalone run`) | +| `docker_compose.py` | Parallel workflow execution via Docker Compose (`osmo docker-compose run`) | Features: Tab completion (shtab), response formatting (`formatters.py`), spec editor (`editor.py`), PyInstaller packaging (`cli_builder.py`, `packaging/`). diff --git a/src/cli/docker_compose.py b/src/cli/docker_compose.py new file mode 100644 index 000000000..ab44e4f62 --- /dev/null +++ b/src/cli/docker_compose.py @@ -0,0 +1,85 @@ +""" +SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # pylint: disable=line-too-long + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +SPDX-License-Identifier: Apache-2.0 +""" + +import argparse +import sys + +import shtab + +from src.utils import compose_executor + + +def setup_parser(parser: argparse._SubParsersAction): + """Register the 'docker-compose' subcommand for parallel workflow execution.""" + dc_parser = parser.add_parser( + 'docker-compose', + help='Run workflows using Docker Compose for parallel execution ' + '(no Kubernetes cluster required).') + subparsers = dc_parser.add_subparsers(dest='command') + subparsers.required = True + + run_parser = subparsers.add_parser( + 'run', + help='Execute a workflow spec using Docker Compose for parallel task execution.') + run_parser.add_argument( + '-f', '--file', + required=True, + dest='workflow_file', + help='Path to the workflow YAML spec file.').complete = shtab.FILE + run_parser.add_argument( + '--work-dir', + dest='work_dir', + default=None, + help='Directory for task inputs/outputs and the generated docker-compose.yml. ' + 'Defaults to a temporary directory.') + run_parser.add_argument( + '--keep', + action='store_true', + default=False, + help='Keep the work directory after execution (always kept on failure).') + run_parser.add_argument( + '--compose-cmd', + dest='compose_cmd', + default='docker compose', + help='Docker Compose command to use (e.g. "docker-compose" for V1). ' + 'Default: "docker compose".') + run_parser.add_argument( + '--shm-size', + dest='shm_size', + default=None, + help='Shared memory size for GPU containers (e.g. 16g, 32g). 
' + 'Defaults to 16g for tasks that request GPUs.') + run_parser.set_defaults(func=_run_compose) + + +def _run_compose(service_client, args: argparse.Namespace): + """Execute a workflow via Docker Compose using the parsed CLI arguments.""" + try: + success = compose_executor.run_workflow_compose( + spec_path=args.workflow_file, + work_dir=args.work_dir, + keep_work_dir=args.keep, + compose_cmd=args.compose_cmd, + shm_size=args.shm_size, + ) + except (ValueError, FileNotFoundError, PermissionError) as error: + print(f'Error: {error}', file=sys.stderr) + sys.exit(1) + + if not success: + sys.exit(1) diff --git a/src/cli/main_parser.py b/src/cli/main_parser.py index 59dfc0043..654673923 100644 --- a/src/cli/main_parser.py +++ b/src/cli/main_parser.py @@ -28,6 +28,7 @@ credential, data, dataset, + docker_compose, standalone, login, pool, @@ -58,6 +59,7 @@ user.setup_parser, config.setup_parser, standalone.setup_parser, + docker_compose.setup_parser, ) diff --git a/src/utils/BUILD b/src/utils/BUILD index 8dbf59a94..3e5eaae19 100644 --- a/src/utils/BUILD +++ b/src/utils/BUILD @@ -136,3 +136,14 @@ osmo_py_library( ], visibility = ["//visibility:public"], ) + +osmo_py_library( + name = "compose_executor", + srcs = ["compose_executor.py"], + deps = [ + requirement("pyyaml"), + "//src/utils:standalone_executor", + "//src/utils/job", + ], + visibility = ["//visibility:public"], +) diff --git a/src/utils/compose_executor.py b/src/utils/compose_executor.py new file mode 100644 index 000000000..928ddcada --- /dev/null +++ b/src/utils/compose_executor.py @@ -0,0 +1,527 @@ +""" +SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # pylint: disable=line-too-long + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
class ComposeExecutor(StandaloneExecutor):
    """
    Executes an OSMO workflow spec using Docker Compose for parallel task execution.

    Extends StandaloneExecutor with:
      - True parallel execution of independent tasks within each scheduling wave
      - {{host:taskname}} token support via Docker Compose DNS
      - Shared network per task group for gang-scheduled communication
      - GPU passthrough via compose deploy.resources.reservations

    Execution model:
      Generates a single docker-compose.yml with all services defined up-front,
      then executes them in waves. Each wave contains all tasks whose upstream
      dependencies are satisfied. Tasks within a wave run in parallel via
      ``docker compose up``. Group co-scheduling is enforced so that all members
      of a multi-task group start together in the same wave.
    """

    # Captures the task name of a ``{{host:<task>}}`` token. The name is matched
    # with ``[^\s{}]+`` rather than a greedy ``\S+`` so that two tokens written
    # back to back (``{{host:a}},{{host:b}}``) are seen as two separate names
    # instead of one bogus name spanning both tokens.
    _HOST_TOKEN_NAME_PATTERN = re.compile(r'\{\{\s*host:([^\s{}]+)\s*\}\}')

    def __init__(self, work_dir: str, keep_work_dir: bool = False,
                 compose_cmd: str = 'docker compose', shm_size: str | None = None):
        """
        Args:
            work_dir: Directory that receives per-task folders and the compose file.
            keep_work_dir: Forwarded unchanged to ``StandaloneExecutor``.
            compose_cmd: Whitespace-separated command used to invoke Compose.
            shm_size: Optional shared-memory size applied to services; GPU
                services fall back to ``DEFAULT_SHM_SIZE`` when it is unset.
        """
        super().__init__(work_dir=work_dir, keep_work_dir=keep_work_dir,
                         docker_cmd='docker', shm_size=shm_size)
        self._compose_cmd = compose_cmd

    @property
    def _compose_file_path(self) -> str:
        """Path of the generated compose file inside the work directory."""
        return os.path.join(self._work_dir, COMPOSE_FILE_NAME)

    def _compose_project_name(self, spec: workflow_module.WorkflowSpec) -> str:
        """Return ``osmo-<name>`` with every character outside ``[a-z0-9-]`` replaced by ``-``."""
        return f'osmo-{re.sub(r"[^a-z0-9-]", "-", spec.name.lower())}'

    def _compose_base_cmd(self, spec: workflow_module.WorkflowSpec) -> List[str]:
        """Return the compose invocation prefix: command, project name and compose file."""
        return (
            self._compose_cmd.split()
            + ['-p', self._compose_project_name(spec), '-f', self._compose_file_path]
        )

    # ------------------------------------------------------------------
    # Execution
    # ------------------------------------------------------------------

    def execute(self, spec: workflow_module.WorkflowSpec,
                resume: bool = False, from_step: str | None = None) -> bool:
        """Run all tasks in wave-parallel order via Docker Compose.

        Args:
            spec: Parsed workflow specification.
            resume: Accepted for signature compatibility; not honoured here.
            from_step: Accepted for signature compatibility; not honoured here.

        Returns:
            True when every task finished with exit code 0 (or failed in a way
            that ``_is_nonlead_failure_ignorable`` accepts); False on a fatal
            task failure or when some tasks could never be scheduled.

        Raises:
            ValueError: When the spec uses features rejected by
                ``_validate_for_compose`` or contains unresolved tokens.
        """
        if resume or from_step is not None:
            # This method never reads either argument, so tell the user
            # instead of silently running the whole workflow.
            logger.warning(
                'resume/from_step are not supported in docker-compose mode '
                'and will be ignored')

        self._results.clear()
        self._build_dag(spec)
        self._validate_for_compose(spec)
        self._setup_directories()
        self._write_inline_files(spec)
        self._generate_compose_file(spec)

        groups = self._groups(spec)
        total_tasks = sum(len(g.tasks) for g in groups)
        logger.info('Workflow "%s": %d task(s) across %d group(s) [docker-compose mode]',
                    spec.name, total_tasks, len(groups))

        try:
            wave_number = 0
            while True:
                wave = self._find_ready_wave()
                if not wave:
                    break

                wave_number += 1
                logger.info('=== Wave %d: %s ===', wave_number, ', '.join(wave))

                wave_results = self._run_wave(wave, spec)

                fatal_failure = False
                for task_name, exit_code in wave_results.items():
                    output_dir = os.path.join(self._work_dir, task_name, 'output')
                    self._results[task_name] = TaskResult(
                        name=task_name, exit_code=exit_code, output_dir=output_dir)

                    if exit_code != 0:
                        if self._is_nonlead_failure_ignorable(task_name):
                            logger.warning(
                                'Non-lead task "%s" failed with exit code %d '
                                '(ignored — group "%s" has ignoreNonleadStatus=true)',
                                task_name, exit_code, self._task_nodes[task_name].group)
                        else:
                            logger.error('Task "%s" failed with exit code %d',
                                         task_name, exit_code)
                            self._cancel_downstream(task_name)
                            fatal_failure = True
                    else:
                        logger.info('Task "%s" completed successfully', task_name)

                # Record every result of the wave before giving up, so the
                # caller sees the full picture of what ran.
                if fatal_failure:
                    return False

            unexecuted = set(self._task_nodes.keys()) - set(self._results.keys())
            if unexecuted:
                logger.error(
                    'Workflow "%s" stalled — tasks could not be scheduled '
                    '(possible cycle or unsatisfiable group): %s',
                    spec.name, ', '.join(sorted(unexecuted)))
                return False

            fatal_failures = [
                name for name, result in self._results.items()
                if result.exit_code != 0
                and not self._is_nonlead_failure_ignorable(name)
            ]
            if fatal_failures:
                logger.error('Workflow failed. Failed tasks: %s',
                             ', '.join(fatal_failures))
                return False

            logger.info('Workflow "%s" completed successfully', spec.name)
            return True
        finally:
            # Containers and networks are removed on every exit path.
            self._compose_cleanup(spec)

    # ------------------------------------------------------------------
    # Validation
    # ------------------------------------------------------------------

    def _validate_for_compose(self, spec: workflow_module.WorkflowSpec):
        """Reject cluster-only features while allowing {{host:}} tokens.

        All unsupported features are collected and reported in one
        ``ValueError``; an invalid host token raises immediately from
        ``_validate_host_tokens``.
        """
        unsupported_features: List[str] = []
        for group in self._groups(spec):
            for task_spec in group.tasks:
                for input_source in task_spec.inputs:
                    if isinstance(input_source, task_module.DatasetInputOutput):
                        unsupported_features.append(
                            f'Task "{task_spec.name}": dataset inputs require object storage')
                    elif isinstance(input_source, task_module.URLInputOutput):
                        unsupported_features.append(
                            f'Task "{task_spec.name}": URL inputs require network/storage access')

                for output in task_spec.outputs:
                    if isinstance(output, (task_module.DatasetInputOutput,
                                           task_module.URLInputOutput)):
                        unsupported_features.append(
                            f'Task "{task_spec.name}": dataset/URL outputs require object storage')

                if task_spec.credentials:
                    unsupported_features.append(
                        f'Task "{task_spec.name}": credentials require the OSMO secret manager')

                if task_spec.checkpoint:
                    unsupported_features.append(
                        f'Task "{task_spec.name}": checkpoints require object storage')

                if task_spec.volumeMounts:
                    unsupported_features.append(
                        f'Task "{task_spec.name}": volumeMounts require cluster-level host paths')

                if task_spec.privileged:
                    unsupported_features.append(
                        f'Task "{task_spec.name}": privileged containers are not '
                        f'supported in docker-compose mode')

                if task_spec.hostNetwork:
                    unsupported_features.append(
                        f'Task "{task_spec.name}": hostNetwork is not supported '
                        f'in docker-compose mode')

                self._validate_host_tokens(task_spec, group)

        if unsupported_features:
            raise ValueError(
                'The following features are not supported in docker-compose '
                'execution mode:\n - '
                + '\n - '.join(unsupported_features))

    def _validate_host_tokens(self, task_spec: task_module.TaskSpec,
                              group: task_module.TaskGroupSpec):
        """Ensure {{host:taskname}} tokens only reference tasks in the same group.

        Command, args, environment values and inline file contents are scanned.

        Raises:
            ValueError: On the first token naming a task outside ``group``.
        """
        group_task_names = {t.name for t in group.tasks}
        fields_to_check = list(task_spec.command) + list(task_spec.args)
        fields_to_check += list(task_spec.environment.values())
        fields_to_check += [file_spec.contents for file_spec in task_spec.files]

        for field in fields_to_check:
            for match in self._HOST_TOKEN_NAME_PATTERN.finditer(field):
                referenced_task = match.group(1)
                if referenced_task not in group_task_names:
                    raise ValueError(
                        f'Task "{task_spec.name}": {{{{host:{referenced_task}}}}} '
                        f'references a task outside its group "{group.name}". '
                        f'Host tokens can only reference tasks within the same group.')

    # ------------------------------------------------------------------
    # Token map (extended with {{host:taskname}})
    # ------------------------------------------------------------------

    def _build_token_map(self, node: TaskNode) -> Dict[str, str]:
        """Extend the parent's token map with ``host:<task>`` for every task in the node's group.

        Each host token resolves to the task name itself, which is also the
        compose service name written by ``_generate_compose_file``.
        """
        tokens = super()._build_token_map(node)
        group_spec = self._group_specs[node.group]
        for task_spec in group_spec.tasks:
            tokens[f'host:{task_spec.name}'] = task_spec.name
        return tokens

    # ------------------------------------------------------------------
    # Inline files
    # ------------------------------------------------------------------

    def _write_inline_files(self, spec: workflow_module.WorkflowSpec):
        """Write all inline file specs to disk with token substitution.

        Files are placed under ``<work_dir>/<task>/files/<path>``.

        Raises:
            ValueError: When a file path resolves outside the task's files directory.
        """
        for group in self._groups(spec):
            for task_spec in group.tasks:
                node = self._task_nodes[task_spec.name]
                token_map = self._build_token_map(node)
                files_dir = os.path.join(self._work_dir, task_spec.name, 'files')
                os.makedirs(files_dir, exist_ok=True)

                for file_spec in task_spec.files:
                    resolved_contents = self._substitute_tokens(
                        file_spec.contents, token_map)
                    # realpath collapses ".." and symlinks, so the prefix
                    # check below catches traversal out of files_dir.
                    host_path = os.path.realpath(
                        os.path.join(files_dir, file_spec.path.lstrip('/')))
                    if not host_path.startswith(os.path.realpath(files_dir) + os.sep):
                        raise ValueError(
                            f'Task "{task_spec.name}": file path '
                            f'"{file_spec.path}" escapes the task directory')
                    os.makedirs(os.path.dirname(host_path), exist_ok=True)
                    with open(host_path, 'w', encoding='utf-8') as f:
                        f.write(resolved_contents)

    # ------------------------------------------------------------------
    # Compose file generation
    # ------------------------------------------------------------------

    def _generate_compose_file(self, spec: workflow_module.WorkflowSpec):
        """Write a docker-compose.yml containing every task as a service.

        One bridge network is declared per task group that has at least one task.
        """
        compose: Dict = {'services': {}}
        networks_needed: set = set()

        for task_name, node in self._task_nodes.items():
            service = self._build_compose_service(node, spec)
            compose['services'][task_name] = service
            networks_needed.add(node.group)

        if networks_needed:
            compose['networks'] = {
                name: {'driver': 'bridge'}
                for name in sorted(networks_needed)
            }

        with open(self._compose_file_path, 'w', encoding='utf-8') as f:
            yaml.safe_dump(compose, f, default_flow_style=False, sort_keys=False)

        logger.info('Generated compose file: %s', self._compose_file_path)

    @staticmethod
    def _escape_compose_interpolation(text: str) -> str:
        """Escape ``$`` as ``$$`` so Docker Compose passes them literally to the container."""
        return text.replace('$', '$$')

    def _build_compose_service(self, node: TaskNode,
                               spec: workflow_module.WorkflowSpec) -> Dict:
        """Build a single Docker Compose service definition for a task.

        The first command element becomes ``entrypoint``; the remaining command
        elements followed by the args become ``command``.

        Raises:
            ValueError: Propagated from ``_check_unresolved_tokens``.
        """
        task_spec = node.spec
        token_map = self._build_token_map(node)

        resolved_command = [
            self._substitute_tokens(c, token_map) for c in task_spec.command]
        resolved_args = [
            self._substitute_tokens(a, token_map) for a in task_spec.args]
        resolved_environment = {
            key: self._substitute_tokens(value, token_map)
            for key, value in task_spec.environment.items()
        }

        all_resolved = (
            resolved_command + resolved_args + list(resolved_environment.values())
            + [self._substitute_tokens(f.contents, token_map)
               for f in task_spec.files]
        )
        self._check_unresolved_tokens(node.name, all_resolved)

        esc = self._escape_compose_interpolation

        service: Dict = {'image': task_spec.image}

        if resolved_command:
            service['entrypoint'] = [esc(resolved_command[0])]
            trailing = resolved_command[1:] + resolved_args
            if trailing:
                service['command'] = [esc(t) for t in trailing]
        elif resolved_args:
            service['command'] = [esc(a) for a in resolved_args]

        if resolved_environment:
            service['environment'] = {
                k: esc(v) for k, v in resolved_environment.items()
            }

        volumes: List[str] = []
        task_dir = os.path.abspath(os.path.join(self._work_dir, node.name))
        output_dir = os.path.join(task_dir, 'output')
        volumes.append(f'{output_dir}:{CONTAINER_DATA_PATH}/output')

        # The mount index is the input's position in the spec, counting
        # inputs of other kinds as well.
        for index, input_source in enumerate(task_spec.inputs):
            if isinstance(input_source, task_module.TaskInputOutput):
                upstream_output = os.path.abspath(
                    os.path.join(self._work_dir, input_source.task, 'output'))
                volumes.append(
                    f'{upstream_output}:{CONTAINER_DATA_PATH}/input/{index}:ro')

        files_dir = os.path.join(task_dir, 'files')
        for file_spec in task_spec.files:
            host_path = os.path.realpath(
                os.path.join(files_dir, file_spec.path.lstrip('/')))
            volumes.append(f'{host_path}:{file_spec.path}:ro')

        if volumes:
            # Compose interpolates ``$`` in volume strings too, so a work-dir
            # or file path containing ``$`` must be escaped like the command.
            service['volumes'] = [esc(volume) for volume in volumes]

        service['networks'] = [node.group]

        gpu_count = self._task_gpu_count(task_spec, spec)
        if gpu_count > 0:
            service['deploy'] = {
                'resources': {
                    'reservations': {
                        'devices': [{
                            'driver': 'nvidia',
                            'count': gpu_count,
                            'capabilities': ['gpu'],
                        }]
                    }
                }
            }
            service['shm_size'] = self._shm_size or self.DEFAULT_SHM_SIZE
        elif self._shm_size:
            service['shm_size'] = self._shm_size

        return service

    # ------------------------------------------------------------------
    # Wave scheduling
    # ------------------------------------------------------------------

    def _find_ready_wave(self) -> List[str]:
        """
        Return the next batch of tasks to run in parallel.

        All members of a multi-task group are co-scheduled: a group is only
        included when every unfinished member has its upstream dependencies
        satisfied. If co-scheduling stalls (e.g. cross-group edges inside a
        multi-task group), we fall back to plain task-level readiness to avoid
        deadlocks.
        """
        ready_tasks = self._find_ready_tasks()
        if not ready_tasks:
            return []

        ready_set = set(ready_tasks)

        groups_with_ready: Dict[str, List[str]] = {}
        for task_name in ready_tasks:
            group = self._task_nodes[task_name].group
            groups_with_ready.setdefault(group, []).append(task_name)

        wave: List[str] = []
        for group_name, group_ready in groups_with_ready.items():
            group_spec = self._group_specs[group_name]
            all_members = {t.name for t in group_spec.tasks}
            unfinished = all_members - set(self._results.keys())

            if unfinished.issubset(ready_set):
                wave.extend(sorted(unfinished))
            elif len(all_members) == 1:
                wave.extend(group_ready)

        # No group could be co-scheduled: run whatever is ready rather than stall.
        if not wave and ready_tasks:
            wave = ready_tasks

        return wave

    # ------------------------------------------------------------------
    # Wave execution
    # ------------------------------------------------------------------

    def _run_wave(self, task_names: List[str],
                  spec: workflow_module.WorkflowSpec) -> Dict[str, int]:
        """Start *task_names* in parallel and block until they all exit.

        Returns:
            Mapping of task name to exit code. Every task gets 127 when the
            compose executable cannot be found.
        """
        base_cmd = self._compose_base_cmd(spec)

        up_cmd = base_cmd + ['up', '--no-deps', '--no-log-prefix'] + list(task_names)
        logger.debug('Compose command: %s', ' '.join(up_cmd))

        try:
            subprocess.run(up_cmd, check=False)
        except FileNotFoundError:
            logger.error(
                'Docker Compose not found. Is "%s" available in your PATH?',
                self._compose_cmd)
            return {name: 127 for name in task_names}

        # Exit codes are read before the containers are removed below.
        results: Dict[str, int] = {}
        for task_name in task_names:
            results[task_name] = self._get_service_exit_code(task_name, spec)

        rm_cmd = base_cmd + ['rm', '-f'] + list(task_names)
        subprocess.run(rm_cmd, capture_output=True, check=False)

        return results

    def _get_service_exit_code(self, service_name: str,
                               spec: workflow_module.WorkflowSpec) -> int:
        """Query Docker Compose for the exit code of *service_name*.

        The ``ps`` output is parsed line by line; each line may decode to a
        single object or to a list of objects.

        Returns:
            The reported ``ExitCode``, or 1 when the query fails, times out,
            or yields no entry for the service.
        """
        ps_cmd = self._compose_base_cmd(spec) + [
            'ps', '-a', '--format', 'json', service_name,
        ]
        try:
            result = subprocess.run(
                ps_cmd, capture_output=True, text=True, timeout=30, check=False)
        except (subprocess.TimeoutExpired, FileNotFoundError):
            logger.warning('Could not determine exit code for "%s"', service_name)
            return 1

        if result.returncode != 0:
            logger.warning('Failed to query exit code for "%s": %s',
                           service_name, result.stderr.strip())
            return 1

        for line in result.stdout.strip().splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                container_info = json.loads(line)
            except json.JSONDecodeError:
                continue
            entries = container_info if isinstance(container_info, list) else [container_info]
            for entry in entries:
                # Skip anything that is not a JSON object instead of failing on .get().
                if isinstance(entry, dict) and entry.get('Service') == service_name:
                    return entry.get('ExitCode', 1)

        logger.warning('No container info found for service "%s"', service_name)
        return 1

    # ------------------------------------------------------------------
    # Cleanup
    # ------------------------------------------------------------------

    def _compose_cleanup(self, spec: workflow_module.WorkflowSpec):
        """Tear down containers and networks created by Docker Compose.

        Best effort: a timeout or a missing compose executable is logged, not raised.
        """
        down_cmd = self._compose_base_cmd(spec) + ['down', '--remove-orphans']
        try:
            subprocess.run(down_cmd, capture_output=True, timeout=60, check=False)
        except (subprocess.TimeoutExpired, FileNotFoundError):
            logger.warning('Failed to clean up Docker Compose resources')
class TestComposeFileGeneration(unittest.TestCase):
    """Check the docker-compose.yml written for a variety of workflow specs."""

    def setUp(self):
        self.work_dir = tempfile.mkdtemp(prefix='osmo-compose-test-')

    def tearDown(self):
        shutil.rmtree(self.work_dir, ignore_errors=True)

    def _make_executor(self) -> ComposeExecutor:
        return ComposeExecutor(work_dir=self.work_dir, keep_work_dir=True)

    def _generate_and_load(self, spec_text: str) -> dict:
        """Run every pre-execution step on *spec_text* and return the parsed compose file."""
        executor = self._make_executor()
        spec = executor.load_spec(spec_text)
        executor._build_dag(spec)
        executor._validate_for_compose(spec)
        executor._setup_directories()
        executor._write_inline_files(spec)
        executor._generate_compose_file(spec)
        with open(os.path.join(self.work_dir, COMPOSE_FILE_NAME), encoding='utf-8') as f:
            return yaml.safe_load(f)

    def test_single_task_generates_one_service(self):
        """One task in the spec yields exactly one compose service."""
        compose = self._generate_and_load(textwrap.dedent('''\
            workflow:
              name: hello
              tasks:
              - name: greet
                image: alpine:3.18
                command: ["echo", "hello"]
        '''))

        services = compose['services']
        self.assertIn('greet', services)
        self.assertEqual(len(services), 1)
        greet = services['greet']
        self.assertEqual(greet['image'], 'alpine:3.18')
        self.assertEqual(greet['entrypoint'], ['echo'])
        self.assertEqual(greet['command'], ['hello'])

    def test_parallel_tasks_generate_separate_services(self):
        """Two unrelated tasks become two services, neither with depends_on."""
        compose = self._generate_and_load(textwrap.dedent('''\
            workflow:
              name: parallel
              tasks:
              - name: task-a
                image: alpine:3.18
                command: ["echo", "a"]
              - name: task-b
                image: alpine:3.18
                command: ["echo", "b"]
        '''))

        services = compose['services']
        self.assertEqual(len(services), 2)
        self.assertIn('task-a', services)
        self.assertIn('task-b', services)
        for service in services.values():
            self.assertNotIn('depends_on', service)

    def test_volumes_for_output(self):
        """The task's host output directory is mounted at the container output path."""
        compose = self._generate_and_load(textwrap.dedent('''\
            workflow:
              name: vol-test
              tasks:
              - name: task
                image: alpine:3.18
                command: ["echo"]
        '''))

        host_output = os.path.abspath(os.path.join(self.work_dir, "task", "output"))
        expected = f'{host_output}:{CONTAINER_DATA_PATH}/output'
        self.assertIn(expected, compose['services']['task']['volumes'])

    def test_upstream_input_volumes(self):
        """A consumer mounts its producer's output directory read-only as input 0."""
        compose = self._generate_and_load(textwrap.dedent('''\
            workflow:
              name: serial
              tasks:
              - name: producer
                image: alpine:3.18
                command: ["echo"]
              - name: consumer
                image: alpine:3.18
                command: ["echo"]
                inputs:
                - task: producer
        '''))

        producer_output = os.path.abspath(
            os.path.join(self.work_dir, 'producer', 'output'))
        expected = f'{producer_output}:{CONTAINER_DATA_PATH}/input/0:ro'
        self.assertIn(expected, compose['services']['consumer']['volumes'])

    def test_environment_variables_included(self):
        """Spec environment entries are copied into the service definition."""
        compose = self._generate_and_load(textwrap.dedent('''\
            workflow:
              name: env-test
              tasks:
              - name: task
                image: alpine:3.18
                command: ["printenv"]
                environment:
                  FOO: bar
                  BAZ: "42"
        '''))

        environment = compose['services']['task']['environment']
        self.assertEqual(environment['FOO'], 'bar')
        self.assertEqual(environment['BAZ'], '42')

    def test_inline_files_mounted(self):
        """An inline file is written on the host and bind-mounted read-only."""
        compose = self._generate_and_load(textwrap.dedent('''\
            workflow:
              name: files-test
              tasks:
              - name: task
                image: alpine:3.18
                command: ["sh", "/tmp/run.sh"]
                files:
                - contents: echo hello
                  path: /tmp/run.sh
        '''))

        matching = [
            volume for volume in compose['services']['task']['volumes']
            if '/tmp/run.sh:ro' in volume
        ]
        self.assertEqual(len(matching), 1)

        host_path = matching[0].split(':')[0]
        self.assertTrue(os.path.exists(host_path))
        with open(host_path, encoding='utf-8') as f:
            self.assertEqual(f.read(), 'echo hello')

    def test_group_network_assigned(self):
        """Every member of a group is attached to the network named after that group."""
        compose = self._generate_and_load(textwrap.dedent('''\
            workflow:
              name: grouped
              groups:
              - name: workers
                tasks:
                - name: leader
                  lead: true
                  image: alpine:3.18
                  command: ["echo"]
                - name: follower
                  image: alpine:3.18
                  command: ["echo"]
        '''))

        self.assertIn('workers', compose.get('networks', {}))
        services = compose['services']
        self.assertEqual(services['leader']['networks'], ['workers'])
        self.assertEqual(services['follower']['networks'], ['workers'])

    def test_gpu_resources_in_compose(self):
        """A GPU task gets an nvidia device reservation and the default shm_size."""
        compose = self._generate_and_load(textwrap.dedent('''\
            workflow:
              name: gpu-test
              resources:
                gpu-res:
                  gpu: 2
              tasks:
              - name: train
                image: pytorch:latest
                resource: gpu-res
                command: ["python", "train.py"]
        '''))

        train = compose['services']['train']
        devices = train['deploy']['resources']['reservations']['devices']
        self.assertEqual(len(devices), 1)
        device = devices[0]
        self.assertEqual(device['driver'], 'nvidia')
        self.assertEqual(device['count'], 2)
        self.assertIn('gpu', device['capabilities'])
        self.assertEqual(train['shm_size'], '16g')

    def test_custom_shm_size(self):
        """An explicit shm_size replaces the GPU default."""
        spec_text = textwrap.dedent('''\
            workflow:
              name: shm-test
              resources:
                gpu-res:
                  gpu: 1
              tasks:
              - name: train
                image: pytorch:latest
                resource: gpu-res
                command: ["python"]
        ''')
        executor = ComposeExecutor(
            work_dir=self.work_dir, keep_work_dir=True, shm_size='32g')
        spec = executor.load_spec(spec_text)
        executor._build_dag(spec)
        executor._validate_for_compose(spec)
        executor._setup_directories()
        executor._generate_compose_file(spec)

        with open(os.path.join(self.work_dir, COMPOSE_FILE_NAME), encoding='utf-8') as f:
            compose = yaml.safe_load(f)
        self.assertEqual(compose['services']['train']['shm_size'], '32g')

    def test_non_gpu_task_no_deploy_section(self):
        """A task without GPUs carries no deploy section."""
        compose = self._generate_and_load(textwrap.dedent('''\
            workflow:
              name: cpu-test
              tasks:
              - name: preprocess
                image: alpine:3.18
                command: ["echo"]
        '''))
        self.assertNotIn('deploy', compose['services']['preprocess'])

    def test_entrypoint_and_command_split(self):
        """The first command element is the entrypoint; the rest plus args form the command."""
        compose = self._generate_and_load(textwrap.dedent('''\
            workflow:
              name: split-test
              tasks:
              - name: task
                image: alpine:3.18
                command: ["bash", "-c"]
                args: ["echo hello"]
        '''))

        task_service = compose['services']['task']
        self.assertEqual(task_service['entrypoint'], ['bash'])
        self.assertEqual(task_service['command'], ['-c', 'echo hello'])
class TestComposeValidation(unittest.TestCase):
    """Compose-mode validation: host tokens pass, cluster-only features are refused."""

    def _make_executor(self) -> ComposeExecutor:
        return ComposeExecutor(work_dir='/tmp/unused')

    def _prepare(self, spec_text: str):
        """Load *spec_text* and build its DAG; return ``(executor, spec)`` ready for validation."""
        executor = self._make_executor()
        spec = executor.load_spec(spec_text)
        executor._build_dag(spec)
        return executor, spec

    def test_host_tokens_accepted(self):
        """A {{host:taskname}} token naming a same-group task validates cleanly."""
        executor, spec = self._prepare(textwrap.dedent('''\
            workflow:
              name: host-ok
              groups:
              - name: workers
                tasks:
                - name: leader
                  lead: true
                  image: alpine:3.18
                  command: ["echo"]
                  args: ["--peer={{host:follower}}"]
                - name: follower
                  image: alpine:3.18
                  command: ["echo"]
        '''))
        executor._validate_for_compose(spec)

    def test_host_token_cross_group_rejected(self):
        """A {{host:taskname}} token naming a task from another group raises ValueError."""
        executor, spec = self._prepare(textwrap.dedent('''\
            workflow:
              name: cross-group
              groups:
              - name: group-a
                tasks:
                - name: task-a
                  lead: true
                  image: alpine:3.18
                  command: ["echo"]
                  args: ["--peer={{host:task-b}}"]
              - name: group-b
                tasks:
                - name: task-b
                  lead: true
                  image: alpine:3.18
                  command: ["echo"]
        '''))
        with self.assertRaises(ValueError) as context:
            executor._validate_for_compose(spec)
        message = str(context.exception)
        self.assertIn('host:task-b', message)
        self.assertIn('outside its group', message)

    def test_dataset_input_rejected(self):
        """Dataset inputs remain unsupported in compose mode."""
        executor, spec = self._prepare(textwrap.dedent('''\
            workflow:
              name: bad
              tasks:
              - name: task
                image: ubuntu:24.04
                command: ["echo"]
                inputs:
                - dataset:
                    name: my_dataset
        '''))
        with self.assertRaises(ValueError) as context:
            executor._validate_for_compose(spec)
        self.assertIn('dataset', str(context.exception))

    def test_credentials_rejected(self):
        """Task credentials are refused in compose mode."""
        executor, spec = self._prepare(textwrap.dedent('''\
            workflow:
              name: bad
              tasks:
              - name: task
                image: ubuntu:24.04
                command: ["echo"]
                credentials:
                  my-secret: NGC_API_KEY
        '''))
        with self.assertRaises(ValueError) as context:
            executor._validate_for_compose(spec)
        self.assertIn('credentials', str(context.exception))

    def test_simple_spec_passes(self):
        """A spec whose only inputs are task-to-task edges validates cleanly."""
        executor, spec = self._prepare(textwrap.dedent('''\
            workflow:
              name: ok
              tasks:
              - name: producer
                image: alpine:3.18
                command: ["echo"]
              - name: consumer
                image: alpine:3.18
                command: ["echo"]
                inputs:
                - task: producer
        '''))
        executor._validate_for_compose(spec)
executor.load_spec(spec_text) + executor._build_dag(spec) + return executor + + def test_all_independent_tasks_in_one_wave(self): + """All independent tasks appear in the first wave.""" + executor = self._make_executor(textwrap.dedent('''\ + workflow: + name: parallel + tasks: + - name: a + image: alpine:3.18 + command: ["echo"] + - name: b + image: alpine:3.18 + command: ["echo"] + - name: c + image: alpine:3.18 + command: ["echo"] + ''')) + wave = executor._find_ready_wave() + self.assertEqual(set(wave), {'a', 'b', 'c'}) + + def test_serial_chain_one_per_wave(self): + """A serial chain yields one task per wave.""" + executor = self._make_executor(textwrap.dedent('''\ + workflow: + name: serial + tasks: + - name: first + image: alpine:3.18 + command: ["echo"] + - name: second + image: alpine:3.18 + command: ["echo"] + inputs: + - task: first + ''')) + + wave1 = executor._find_ready_wave() + self.assertEqual(wave1, ['first']) + + executor._results['first'] = TaskResult( + name='first', exit_code=0, output_dir='/tmp/out') + wave2 = executor._find_ready_wave() + self.assertEqual(wave2, ['second']) + + def test_multi_task_group_co_scheduled(self): + """All tasks in a multi-task group appear in the same wave.""" + executor = self._make_executor(textwrap.dedent('''\ + workflow: + name: grouped + groups: + - name: workers + tasks: + - name: leader + lead: true + image: alpine:3.18 + command: ["echo"] + - name: follower + image: alpine:3.18 + command: ["echo"] + ''')) + wave = executor._find_ready_wave() + self.assertEqual(set(wave), {'leader', 'follower'}) + + def test_diamond_dag_waves(self): + """A diamond DAG produces three waves: root, fan-out, fan-in.""" + executor = self._make_executor(textwrap.dedent('''\ + workflow: + name: diamond + tasks: + - name: root + image: alpine:3.18 + command: ["echo"] + - name: left + image: alpine:3.18 + command: ["echo"] + inputs: + - task: root + - name: right + image: alpine:3.18 + command: ["echo"] + inputs: + - task: root + - 
name: join + image: alpine:3.18 + command: ["echo"] + inputs: + - task: left + - task: right + ''')) + + wave1 = executor._find_ready_wave() + self.assertEqual(wave1, ['root']) + + executor._results['root'] = TaskResult( + name='root', exit_code=0, output_dir='/tmp/out') + wave2 = executor._find_ready_wave() + self.assertEqual(set(wave2), {'left', 'right'}) + + executor._results['left'] = TaskResult( + name='left', exit_code=0, output_dir='/tmp/out') + executor._results['right'] = TaskResult( + name='right', exit_code=0, output_dir='/tmp/out') + wave3 = executor._find_ready_wave() + self.assertEqual(wave3, ['join']) + + def test_empty_wave_when_all_done(self): + """An empty wave is returned when all tasks have completed.""" + executor = self._make_executor(textwrap.dedent('''\ + workflow: + name: done + tasks: + - name: task + image: alpine:3.18 + command: ["echo"] + ''')) + executor._results['task'] = TaskResult( + name='task', exit_code=0, output_dir='/tmp/out') + wave = executor._find_ready_wave() + self.assertEqual(wave, []) + + +class TestComposeProjectName(unittest.TestCase): + """Verify the Docker Compose project name generation.""" + + def test_simple_name(self): + executor = ComposeExecutor(work_dir='/tmp/unused') + spec = executor.load_spec(textwrap.dedent('''\ + workflow: + name: my-workflow + tasks: + - name: t + image: alpine:3.18 + command: ["echo"] + ''')) + self.assertEqual(executor._compose_project_name(spec), 'osmo-my-workflow') + + def test_name_with_special_chars(self): + executor = ComposeExecutor(work_dir='/tmp/unused') + spec = executor.load_spec(textwrap.dedent('''\ + workflow: + name: my-workflow + tasks: + - name: t + image: alpine:3.18 + command: ["echo"] + ''')) + project = executor._compose_project_name(spec) + self.assertTrue(project.startswith('osmo-')) + self.assertRegex(project, r'^[a-z0-9-]+$') + + +class TestJinjaTemplateDetection(unittest.TestCase): + """Verify that Jinja templates are rejected before execution.""" + + def 
_write_temp_spec(self, content: str) -> str: + f = tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) + f.write(content) + f.flush() + f.close() + return f.name + + def test_jinja_block_detected(self): + path = self._write_temp_spec(textwrap.dedent('''\ + workflow: + name: {% if true %}test{% endif %} + tasks: + - name: task + image: alpine:3.18 + command: ["echo"] + ''')) + try: + with self.assertRaises(ValueError) as context: + run_workflow_compose(path) + self.assertIn('Jinja', str(context.exception)) + finally: + os.unlink(path) + + def test_default_values_detected(self): + path = self._write_temp_spec(textwrap.dedent('''\ + workflow: + name: "{{experiment}}" + tasks: + - name: task + image: alpine:3.18 + command: ["echo"] + default-values: + experiment: test + ''')) + try: + with self.assertRaises(ValueError) as context: + run_workflow_compose(path) + self.assertIn('Jinja', str(context.exception)) + finally: + os.unlink(path) + + +class TestUnresolvedTokenDetection(unittest.TestCase): + """Verify that unresolved tokens are caught during compose file generation.""" + + def setUp(self): + self.work_dir = tempfile.mkdtemp(prefix='osmo-compose-tokens-') + + def tearDown(self): + shutil.rmtree(self.work_dir, ignore_errors=True) + + def test_unresolved_jinja_variable_caught(self): + spec_text = textwrap.dedent('''\ + workflow: + name: bad + tasks: + - name: task + image: alpine:3.18 + command: ["echo", "{{missing_var}}"] + ''') + executor = ComposeExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + with self.assertRaises(ValueError) as context: + executor.execute(spec) + self.assertIn('missing_var', str(context.exception)) + + +class TestPathTraversal(unittest.TestCase): + """Verify that file path traversal is prevented.""" + + def setUp(self): + self.work_dir = tempfile.mkdtemp(prefix='osmo-compose-traversal-') + + def tearDown(self): + shutil.rmtree(self.work_dir, ignore_errors=True) + + def 
test_path_traversal_rejected(self): + spec_text = textwrap.dedent('''\ + workflow: + name: bad + tasks: + - name: task + image: alpine:3.18 + command: ["echo"] + files: + - contents: "malicious" + path: /../../etc/evil.conf + ''') + executor = ComposeExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + with self.assertRaises(ValueError) as context: + executor.execute(spec) + self.assertIn('escapes the task directory', str(context.exception)) + + +class TestRunWorkflowComposeErrors(unittest.TestCase): + """Test error handling in run_workflow_compose().""" + + def test_nonexistent_file_raises(self): + with self.assertRaises(FileNotFoundError): + run_workflow_compose(spec_path='/nonexistent/path/spec.yaml') + + +# ============================================================================ +# Integration tests — require Docker Compose +# ============================================================================ + + +@unittest.skipUnless(DOCKER_COMPOSE_AVAILABLE, SKIP_COMPOSE_MSG) +class TestComposeExecution(unittest.TestCase): + """Integration tests that run workflows through Docker Compose.""" + + def setUp(self): + self.work_dir = tempfile.mkdtemp(prefix='osmo-compose-test-') + + def tearDown(self): + shutil.rmtree(self.work_dir, ignore_errors=True) + + def _execute_spec(self, spec_text: str) -> bool: + executor = ComposeExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + return executor.execute(spec) + + def test_hello_world(self): + """Run a minimal single-task workflow.""" + spec_text = textwrap.dedent('''\ + workflow: + name: hello-compose + tasks: + - name: hello + image: alpine:3.18 + command: ["echo", "Hello from Docker Compose!"] + ''') + self.assertTrue(self._execute_spec(spec_text)) + + def test_parallel_independent_tasks(self): + """Independent tasks all execute and produce their outputs.""" + spec_text = textwrap.dedent('''\ + workflow: + name: parallel-compose + tasks: 
+ - name: task-a + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo 'a' > {{output}}/marker.txt"] + - name: task-b + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo 'b' > {{output}}/marker.txt"] + - name: task-c + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo 'c' > {{output}}/marker.txt"] + ''') + self.assertTrue(self._execute_spec(spec_text)) + for task_name, expected in [('task-a', 'a'), ('task-b', 'b'), ('task-c', 'c')]: + marker = os.path.join(self.work_dir, task_name, 'output', 'marker.txt') + with open(marker) as f: + self.assertEqual(f.read().strip(), expected) + + def test_serial_data_flow(self): + """Data written by a producer is readable by a consumer.""" + spec_text = textwrap.dedent('''\ + workflow: + name: serial-compose + tasks: + - name: producer + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo 'from_producer' > {{output}}/data.txt"] + - name: consumer + image: alpine:3.18 + command: ["sh", "-c"] + args: ["cat {{input:0}}/data.txt > {{output}}/received.txt"] + inputs: + - task: producer + ''') + self.assertTrue(self._execute_spec(spec_text)) + received = os.path.join(self.work_dir, 'consumer', 'output', 'received.txt') + with open(received) as f: + self.assertEqual(f.read().strip(), 'from_producer') + + def test_diamond_dag(self): + """A diamond DAG executes with correct data flow.""" + spec_text = textwrap.dedent('''\ + workflow: + name: diamond-compose + tasks: + - name: root + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo 'root_data' > {{output}}/base.txt"] + - name: left + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo 'left:' > {{output}}/result.txt && cat {{input:0}}/base.txt >> {{output}}/result.txt"] + inputs: + - task: root + - name: right + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo 'right:' > {{output}}/result.txt && cat {{input:0}}/base.txt >> {{output}}/result.txt"] + inputs: + - task: root + - name: join + image: alpine:3.18 + command: ["sh", "-c"] + args: 
["cat {{input:0}}/result.txt > {{output}}/final.txt && cat {{input:1}}/result.txt >> {{output}}/final.txt"] + inputs: + - task: left + - task: right + ''') + self.assertTrue(self._execute_spec(spec_text)) + final = os.path.join(self.work_dir, 'join', 'output', 'final.txt') + with open(final) as f: + content = f.read() + self.assertIn('left:', content) + self.assertIn('right:', content) + self.assertIn('root_data', content) + + def test_failure_cancels_downstream(self): + """A failed task prevents downstream dependents from running.""" + spec_text = textwrap.dedent('''\ + workflow: + name: fail-compose + tasks: + - name: failing + image: alpine:3.18 + command: ["sh", "-c", "exit 1"] + - name: should-not-run + image: alpine:3.18 + command: ["sh", "-c", "echo oops > {{output}}/bad.txt"] + inputs: + - task: failing + ''') + self.assertFalse(self._execute_spec(spec_text)) + output_file = os.path.join( + self.work_dir, 'should-not-run', 'output', 'bad.txt') + self.assertFalse(os.path.exists(output_file)) + + def test_environment_variables(self): + """Environment variables are passed to compose containers.""" + spec_text = textwrap.dedent('''\ + workflow: + name: env-compose + tasks: + - name: check-env + image: alpine:3.18 + command: ["sh", "-c"] + args: ["test \\"$MY_VAR\\" = \\"hello\\" && echo ok > {{output}}/result.txt"] + environment: + MY_VAR: hello + ''') + self.assertTrue(self._execute_spec(spec_text)) + + def test_inline_file_mounted(self): + """An inline file is written and mounted into the container.""" + spec_text = textwrap.dedent('''\ + workflow: + name: files-compose + tasks: + - name: check-file + image: alpine:3.18 + command: ["sh", "/tmp/run.sh"] + files: + - contents: | + echo "script ran" > {{output}}/result.txt + path: /tmp/run.sh + ''') + self.assertTrue(self._execute_spec(spec_text)) + result = os.path.join(self.work_dir, 'check-file', 'output', 'result.txt') + with open(result) as f: + self.assertIn('script ran', f.read()) + + def 
test_compose_file_preserved(self): + """The generated docker-compose.yml is kept in the work directory.""" + spec_text = textwrap.dedent('''\ + workflow: + name: preserve-compose + tasks: + - name: task + image: alpine:3.18 + command: ["echo", "ok"] + ''') + self._execute_spec(spec_text) + compose_path = os.path.join(self.work_dir, COMPOSE_FILE_NAME) + self.assertTrue(os.path.exists(compose_path)) + + def test_groups_with_data_flow(self): + """Groups with inter-group data dependencies execute correctly.""" + spec_text = textwrap.dedent('''\ + workflow: + name: group-flow-compose + groups: + - name: prepare + tasks: + - name: generate + lead: true + image: alpine:3.18 + command: ["sh", "-c"] + args: + - | + mkdir -p {{output}}/data + for i in 1 2 3; do echo "sample_$i" >> {{output}}/data/dataset.csv; done + - name: train + tasks: + - name: trainer + lead: true + image: alpine:3.18 + command: ["sh", "-c"] + args: + - | + wc -l {{input:0}}/data/dataset.csv > {{output}}/count.txt + inputs: + - task: generate + ''') + self.assertTrue(self._execute_spec(spec_text)) + count_file = os.path.join(self.work_dir, 'trainer', 'output', 'count.txt') + with open(count_file) as f: + self.assertIn('3', f.read()) + + +@unittest.skipUnless(DOCKER_COMPOSE_AVAILABLE, SKIP_COMPOSE_MSG) +class TestComposeLeadTaskPolicy(unittest.TestCase): + """Verify ignoreNonleadStatus behavior in compose mode.""" + + def setUp(self): + self.work_dir = tempfile.mkdtemp(prefix='osmo-compose-lead-') + + def tearDown(self): + shutil.rmtree(self.work_dir, ignore_errors=True) + + def test_nonlead_failure_ignored_when_flag_true(self): + """With ignoreNonleadStatus=true, a non-lead failure does not abort the workflow.""" + spec_text = textwrap.dedent('''\ + workflow: + name: lead-policy-compose + groups: + - name: workers + tasks: + - name: leader + lead: true + image: alpine:3.18 + command: ["echo", "ok"] + - name: follower + image: alpine:3.18 + command: ["sh", "-c", "exit 1"] + ''') + executor = 
ComposeExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + self.assertTrue(executor.execute(spec)) + + def test_lead_failure_aborts_workflow(self): + """A lead task failure aborts the workflow even with ignoreNonleadStatus=true.""" + spec_text = textwrap.dedent('''\ + workflow: + name: lead-fail-compose + groups: + - name: workers + tasks: + - name: leader + lead: true + image: alpine:3.18 + command: ["sh", "-c", "exit 1"] + - name: follower + image: alpine:3.18 + command: ["echo", "ok"] + ''') + executor = ComposeExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + self.assertFalse(executor.execute(spec)) + + +@unittest.skipUnless(DOCKER_COMPOSE_AVAILABLE, SKIP_COMPOSE_MSG) +class TestRunWorkflowCompose(unittest.TestCase): + """Test the top-level run_workflow_compose() function.""" + + def setUp(self): + self.work_dir = tempfile.mkdtemp(prefix='osmo-compose-func-') + + def tearDown(self): + shutil.rmtree(self.work_dir, ignore_errors=True) + + def test_success_with_work_dir(self): + """A successful run preserves the caller-supplied work directory.""" + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: + f.write(textwrap.dedent('''\ + workflow: + name: func-test + tasks: + - name: task + image: alpine:3.18 + command: ["echo", "ok"] + ''')) + spec_path = f.name + try: + result = run_workflow_compose( + spec_path=spec_path, + work_dir=self.work_dir, + keep_work_dir=True, + ) + self.assertTrue(result) + self.assertTrue(os.path.exists(self.work_dir)) + finally: + os.unlink(spec_path) + + def test_failure_preserves_work_dir(self): + """On failure, the work directory is preserved.""" + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: + f.write(textwrap.dedent('''\ + workflow: + name: fail-func + tasks: + - name: task + image: alpine:3.18 + command: ["sh", "-c", "exit 1"] + ''')) + spec_path = f.name + try: + result = 
run_workflow_compose( + spec_path=spec_path, + work_dir=self.work_dir, + keep_work_dir=False, + ) + self.assertFalse(result) + self.assertTrue(os.path.exists(self.work_dir)) + finally: + os.unlink(spec_path) + + +if __name__ == '__main__': + unittest.main() From fd1a794c3be8897abc1e147fe5a8b62f013bd4fe Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Fri, 10 Apr 2026 11:51:28 -0700 Subject: [PATCH 23/35] Enhance Docker Compose and Standalone Executors with error handling and cleanup - Added error handling in `ComposeExecutor` to raise `NotImplementedError` for unsupported resume functionality. - Refactored `run_workflow_compose` and `run_workflow_standalone` to ensure proper cleanup of temporary work directories using a `try-finally` block. - Introduced validation in `StandaloneExecutor` to check for correct YAML mapping in workflow specifications, raising a `ValueError` for invalid formats. - Updated build configurations to include the new `docker_compose.py` file and the `compose_executor` library. 
--- src/cli/BUILD | 2 ++ src/utils/compose_executor.py | 25 +++++++++++++++---------- src/utils/standalone_executor.py | 27 ++++++++++++++++----------- 3 files changed, 33 insertions(+), 21 deletions(-) diff --git a/src/cli/BUILD b/src/cli/BUILD index eade2ea71..9789a9923 100755 --- a/src/cli/BUILD +++ b/src/cli/BUILD @@ -37,6 +37,7 @@ osmo_py_library( "dataset.py", "editor.py", "formatters.py", + "docker_compose.py", "standalone.py", "login.py", "main_parser.py", @@ -74,6 +75,7 @@ osmo_py_library( "//src/lib/utils:validation", "//src/lib/utils:version", "//src/lib/utils:workflow", + "//src/utils:compose_executor", "//src/utils:standalone_executor", ], ) diff --git a/src/utils/compose_executor.py b/src/utils/compose_executor.py index 928ddcada..f1ff8d519 100644 --- a/src/utils/compose_executor.py +++ b/src/utils/compose_executor.py @@ -86,6 +86,10 @@ def _compose_base_cmd(self, spec: workflow_module.WorkflowSpec) -> List[str]: def execute(self, spec: workflow_module.WorkflowSpec, resume: bool = False, from_step: str | None = None) -> bool: """Run all tasks in wave-parallel order via Docker Compose.""" + if resume or from_step: + raise NotImplementedError( + 'docker-compose mode does not support --resume or --from-step yet. 
' + 'Use standalone mode for resume functionality.') self._results.clear() self._build_dag(spec) self._validate_for_compose(spec) @@ -513,15 +517,16 @@ def run_workflow_compose(spec_path: str, work_dir: str | None = None, work_dir = tempfile.mkdtemp(prefix='osmo-compose-') logger.info('Using temporary work directory: %s', work_dir) - executor = ComposeExecutor(work_dir=work_dir, keep_work_dir=keep_work_dir, - compose_cmd=compose_cmd, shm_size=shm_size) - spec = executor.load_spec(spec_text) - success = executor.execute(spec) - - if created_work_dir and not keep_work_dir and success: - logger.info('Cleaning up work directory: %s', work_dir) - shutil.rmtree(work_dir, ignore_errors=True) - elif not success: - logger.info('Work directory preserved for debugging: %s', work_dir) + success = False + try: + executor = ComposeExecutor(work_dir=work_dir, keep_work_dir=keep_work_dir, + compose_cmd=compose_cmd, shm_size=shm_size) + spec = executor.load_spec(spec_text) + success = executor.execute(spec) + finally: + if created_work_dir and not keep_work_dir: + shutil.rmtree(work_dir, ignore_errors=True) + elif not success: + logger.info('Work directory preserved for debugging: %s', work_dir) return success diff --git a/src/utils/standalone_executor.py b/src/utils/standalone_executor.py index 8f4493140..588572d3f 100644 --- a/src/utils/standalone_executor.py +++ b/src/utils/standalone_executor.py @@ -118,6 +118,10 @@ def _detect_available_gpus(self) -> int: def load_spec(self, spec_text: str) -> workflow_module.WorkflowSpec: """Parse raw YAML text into a validated WorkflowSpec via the versioned spec model.""" raw = yaml.safe_load(spec_text) + if not isinstance(raw, dict): + raise ValueError( + f'Expected a YAML mapping for the workflow spec, ' + f'got {type(raw).__name__}') versioned = workflow_module.VersionedWorkflowSpec(**raw) return versioned.workflow @@ -569,16 +573,17 @@ def run_workflow_standalone(spec_path: str, work_dir: str | None = None, work_dir = 
tempfile.mkdtemp(prefix='osmo-standalone-') logger.info('Using temporary work directory: %s', work_dir) - executor = StandaloneExecutor(work_dir=work_dir, keep_work_dir=keep_work_dir, - docker_cmd=docker_cmd, shm_size=shm_size) - spec = executor.load_spec(spec_text) - success = executor.execute(spec, resume=resume or from_step is not None, - from_step=from_step) - - if created_work_dir and not keep_work_dir and success: - logger.info('Cleaning up work directory: %s', work_dir) - shutil.rmtree(work_dir, ignore_errors=True) - elif not success: - logger.info('Work directory preserved for debugging: %s', work_dir) + success = False + try: + executor = StandaloneExecutor(work_dir=work_dir, keep_work_dir=keep_work_dir, + docker_cmd=docker_cmd, shm_size=shm_size) + spec = executor.load_spec(spec_text) + success = executor.execute(spec, resume=resume or from_step is not None, + from_step=from_step) + finally: + if created_work_dir and not keep_work_dir: + shutil.rmtree(work_dir, ignore_errors=True) + elif not success: + logger.info('Work directory preserved for debugging: %s', work_dir) return success From 47ff70fcb5dead2f4ba133731462dd33ab46bcbe Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Fri, 10 Apr 2026 12:03:08 -0700 Subject: [PATCH 24/35] Add cleanup functionality in StandaloneExecutor for rerun tasks - Introduced a new method `_clean_rerun_output_dirs` to remove output directories for tasks that will be re-executed, ensuring no stale artifacts remain. - Updated the `run_workflow_standalone` method to call the cleanup function when resuming or starting from a specific step. - Adjusted GPU device specification formatting in Docker arguments to ensure correct syntax with quotes around device lists. 
--- src/utils/standalone_executor.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/utils/standalone_executor.py b/src/utils/standalone_executor.py index 588572d3f..a33dd9b84 100644 --- a/src/utils/standalone_executor.py +++ b/src/utils/standalone_executor.py @@ -135,6 +135,7 @@ def execute(self, spec: workflow_module.WorkflowSpec, if resume or from_step: self._restore_completed_tasks(from_step) + self._clean_rerun_output_dirs() total_tasks = sum(len(g.tasks) for g in self._groups(spec)) skipped = len(self._results) @@ -250,6 +251,16 @@ def _get_downstream_tasks(self, task_name: str) -> Set[str]: queue.append(downstream) return visited + def _clean_rerun_output_dirs(self): + """Remove output directories for tasks that will be re-executed so no stale artifacts remain.""" + tasks_to_rerun = set(self._task_nodes.keys()) - set(self._results.keys()) + for task_name in tasks_to_rerun: + output_dir = os.path.join(self._work_dir, task_name, 'output') + if os.path.isdir(output_dir): + shutil.rmtree(output_dir) + os.makedirs(output_dir, exist_ok=True) + logger.debug('Cleaned output directory for task "%s"', task_name) + def _groups(self, spec: workflow_module.WorkflowSpec) -> List[task_module.TaskGroupSpec]: """Return the spec's groups, or synthesize one group per task when groups are absent.""" if spec.groups: @@ -417,6 +428,8 @@ def _cancel_downstream(self, failed_task: str): def _task_gpu_count(self, task_spec: task_module.TaskSpec, spec: workflow_module.WorkflowSpec) -> int: """Return the number of GPUs requested by a task's resource spec, defaulting to 0.""" + if task_spec.resources.gpu: + return task_spec.resources.gpu resource_spec = spec.resources.get(task_spec.resource) if resource_spec and resource_spec.gpu: return resource_spec.gpu @@ -463,9 +476,9 @@ def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskR logger.warning( 'Task "%s" requests %d GPU(s) but only %d available — running with %d 
GPU(s)', node.name, gpu_count, available, available) - docker_args += ['--gpus', f'device={",".join(str(i) for i in range(available))}'] + docker_args += ['--gpus', f'"device={",".join(str(i) for i in range(available))}"'] else: - docker_args += ['--gpus', f'device={",".join(str(i) for i in range(gpu_count))}'] + docker_args += ['--gpus', f'"device={",".join(str(i) for i in range(gpu_count))}"'] logger.info('Task "%s" requesting %d GPU(s), using %d', node.name, gpu_count, min(gpu_count, available)) docker_args += ['--shm-size', self._shm_size or self.DEFAULT_SHM_SIZE] From dbf479056a21230ae13951cebb5acdba6555fd61 Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Fri, 10 Apr 2026 16:09:57 -0700 Subject: [PATCH 25/35] Enhance CLI functionality with credential mapping and variable overrides - Added support for `--credential`, `--set`, and `--set-string` arguments in both `docker_compose.py` and `standalone.py` to allow users to map credential names to local directories and override default values in workflow specifications. - Implemented `_parse_credentials` function to validate and parse credential inputs. - Updated `ComposeExecutor` and `StandaloneExecutor` to handle new credential and variable parameters, ensuring proper binding of directories and values during execution. - Enhanced error handling for unsupported credential configurations in workflow specifications. 
--- .gitignore | 4 +- src/cli/docker_compose.py | 42 +++++++++++ src/cli/standalone.py | 42 +++++++++++ src/utils/BUILD | 2 + src/utils/compose_executor.py | 50 +++++++++----- src/utils/standalone_executor.py | 115 ++++++++++++++++++++++++++----- 6 files changed, 218 insertions(+), 37 deletions(-) diff --git a/.gitignore b/.gitignore index b84388d41..f5180c6e9 100644 --- a/.gitignore +++ b/.gitignore @@ -30,4 +30,6 @@ docs/**/domain_config.js .lycheecache -.venv/ \ No newline at end of file +.venv/ +build/ +*.egg-info \ No newline at end of file diff --git a/src/cli/docker_compose.py b/src/cli/docker_compose.py index ab44e4f62..cde0c173f 100644 --- a/src/cli/docker_compose.py +++ b/src/cli/docker_compose.py @@ -17,6 +17,7 @@ """ import argparse +import os import sys import shtab @@ -58,6 +59,28 @@ def setup_parser(parser: argparse._SubParsersAction): default='docker compose', help='Docker Compose command to use (e.g. "docker-compose" for V1). ' 'Default: "docker compose".') + run_parser.add_argument( + '--credential', + nargs='+', + default=[], + help='Map credential names to local directories. ' + 'Format: "=". The directory is bind-mounted read-only ' + 'into the container at the path declared in the spec. ' + 'Example: --credential hf-token=$HOME/.hf') + run_parser.add_argument( + '--set', + nargs='+', + default=[], + help='Override default-values in the workflow spec. ' + 'Format: "=". Values are cast as int or float if ' + 'applicable, otherwise kept as strings.') + run_parser.add_argument( + '--set-string', + dest='set_string', + nargs='+', + default=[], + help='Override default-values in the workflow spec, forcing string type. 
' + 'Format: "=".') run_parser.add_argument( '--shm-size', dest='shm_size', @@ -67,15 +90,34 @@ def setup_parser(parser: argparse._SubParsersAction): run_parser.set_defaults(func=_run_compose) +def _parse_credentials(raw_credentials: list[str]) -> dict[str, str]: + """Parse --credential name=path arguments into a dict.""" + result: dict[str, str] = {} + for entry in raw_credentials: + if '=' not in entry: + raise ValueError( + f'--credential value "{entry}" is incorrectly formatted (expected name=/path)') + name, path = entry.split('=', 1) + if not os.path.isdir(path): + raise ValueError( + f'Credential path for "{name}" does not exist or is not a directory: {path}') + result[name] = path + return result + + def _run_compose(service_client, args: argparse.Namespace): """Execute a workflow via Docker Compose using the parsed CLI arguments.""" try: + credentials = _parse_credentials(args.credential) success = compose_executor.run_workflow_compose( spec_path=args.workflow_file, work_dir=args.work_dir, keep_work_dir=args.keep, compose_cmd=args.compose_cmd, shm_size=args.shm_size, + set_variables=args.set, + set_string_variables=args.set_string, + credentials=credentials, ) except (ValueError, FileNotFoundError, PermissionError) as error: print(f'Error: {error}', file=sys.stderr) diff --git a/src/cli/standalone.py b/src/cli/standalone.py index d30e764ff..8c3c200f7 100644 --- a/src/cli/standalone.py +++ b/src/cli/standalone.py @@ -17,6 +17,7 @@ """ import argparse +import os import sys import shtab @@ -68,6 +69,28 @@ def setup_parser(parser: argparse._SubParsersAction): help='Resume from a specific task, re-running it and all downstream tasks. ' 'Tasks upstream of the specified step are skipped if they completed ' 'successfully. Requires --work-dir pointing to the previous run directory.') + run_parser.add_argument( + '--credential', + nargs='+', + default=[], + help='Map credential names to local directories. ' + 'Format: "=". 
The directory is bind-mounted read-only ' + 'into the container at the path declared in the spec. ' + 'Example: --credential hf-token=$HOME/.hf') + run_parser.add_argument( + '--set', + nargs='+', + default=[], + help='Override default-values in the workflow spec. ' + 'Format: "=". Values are cast as int or float if ' + 'applicable, otherwise kept as strings.') + run_parser.add_argument( + '--set-string', + dest='set_string', + nargs='+', + default=[], + help='Override default-values in the workflow spec, forcing string type. ' + 'Format: "=".') run_parser.add_argument( '--shm-size', dest='shm_size', @@ -78,9 +101,25 @@ def setup_parser(parser: argparse._SubParsersAction): run_parser.set_defaults(func=_run_standalone) +def _parse_credentials(raw_credentials: list[str]) -> dict[str, str]: + """Parse --credential name=path arguments into a dict.""" + result: dict[str, str] = {} + for entry in raw_credentials: + if '=' not in entry: + raise ValueError( + f'--credential value "{entry}" is incorrectly formatted (expected name=/path)') + name, path = entry.split('=', 1) + if not os.path.isdir(path): + raise ValueError( + f'Credential path for "{name}" does not exist or is not a directory: {path}') + result[name] = path + return result + + def _run_standalone(service_client, args: argparse.Namespace): """Execute a workflow in standalone mode via Docker using the parsed CLI arguments.""" try: + credentials = _parse_credentials(args.credential) success = standalone_executor.run_workflow_standalone( spec_path=args.workflow_file, work_dir=args.work_dir, @@ -89,6 +128,9 @@ def _run_standalone(service_client, args: argparse.Namespace): from_step=args.from_step, docker_cmd=args.docker_cmd, shm_size=args.shm_size, + set_variables=args.set, + set_string_variables=args.set_string, + credentials=credentials, ) except (ValueError, FileNotFoundError, PermissionError) as error: print(f'Error: {error}', file=sys.stderr) diff --git a/src/utils/BUILD b/src/utils/BUILD index 
3e5eaae19..a6674a2f3 100644 --- a/src/utils/BUILD +++ b/src/utils/BUILD @@ -131,7 +131,9 @@ osmo_py_library( name = "standalone_executor", srcs = ["standalone_executor.py"], deps = [ + requirement("jinja2"), requirement("pyyaml"), + "//src/lib/utils:workflow", "//src/utils/job", ], visibility = ["//visibility:public"], diff --git a/src/utils/compose_executor.py b/src/utils/compose_executor.py index f1ff8d519..bfe718841 100644 --- a/src/utils/compose_executor.py +++ b/src/utils/compose_executor.py @@ -34,6 +34,8 @@ StandaloneExecutor, TaskNode, TaskResult, + _expand_jinja_locally, + _spec_has_templates, ) @@ -61,9 +63,11 @@ class ComposeExecutor(StandaloneExecutor): """ def __init__(self, work_dir: str, keep_work_dir: bool = False, - compose_cmd: str = 'docker compose', shm_size: str | None = None): + compose_cmd: str = 'docker compose', shm_size: str | None = None, + credentials: Dict[str, str] | None = None): super().__init__(work_dir=work_dir, keep_work_dir=keep_work_dir, - docker_cmd='docker', shm_size=shm_size) + docker_cmd='docker', shm_size=shm_size, + credentials=credentials) self._compose_cmd = compose_cmd @property @@ -177,14 +181,20 @@ def _validate_for_compose(self, spec: workflow_module.WorkflowSpec): f'Task "{task_spec.name}": URL inputs require network/storage access') for output in task_spec.outputs: - if isinstance(output, (task_module.DatasetInputOutput, - task_module.URLInputOutput)): + if isinstance(output, task_module.URLInputOutput): unsupported_features.append( - f'Task "{task_spec.name}": dataset/URL outputs require object storage') - - if task_spec.credentials: - unsupported_features.append( - f'Task "{task_spec.name}": credentials require the OSMO secret manager') + f'Task "{task_spec.name}": URL outputs require object storage') + elif isinstance(output, task_module.DatasetInputOutput): + logger.info( + 'Task "%s": dataset output "%s" ignored in docker-compose mode ' + '— data is available in the work directory', + task_spec.name, 
output.dataset.name) + + for cred_name in task_spec.credentials: + if cred_name not in self._credentials: + unsupported_features.append( + f'Task "{task_spec.name}": credential "{cred_name}" not provided. ' + f'Use --credential {cred_name}=/path/to/dir') if task_spec.checkpoint: unsupported_features.append( @@ -355,6 +365,11 @@ def _build_compose_service(self, node: TaskNode, os.path.join(files_dir, file_spec.path.lstrip('/'))) volumes.append(f'{host_path}:{file_spec.path}:ro') + for cred_name, cred_mount in task_spec.credentials.items(): + if isinstance(cred_mount, str) and cred_name in self._credentials: + local_dir = os.path.abspath(self._credentials[cred_name]) + volumes.append(f'{local_dir}:{cred_mount}:ro') + if volumes: service['volumes'] = volumes @@ -500,17 +515,17 @@ def _compose_cleanup(self, spec: workflow_module.WorkflowSpec): def run_workflow_compose(spec_path: str, work_dir: str | None = None, keep_work_dir: bool = False, compose_cmd: str = 'docker compose', - shm_size: str | None = None) -> bool: + shm_size: str | None = None, + set_variables: List[str] | None = None, + set_string_variables: List[str] | None = None, + credentials: Dict[str, str] | None = None) -> bool: """Load a workflow spec and execute it via Docker Compose.""" with open(spec_path, encoding='utf-8') as f: spec_text = f.read() - template_markers = ('{%', '{#', 'default-values') - if any(marker in spec_text for marker in template_markers): - raise ValueError( - 'This spec uses Jinja templates which require server-side expansion.\n' - 'Run "osmo workflow submit --dry-run -f " first to get the ' - 'expanded spec,\nthen save that output and run it with docker-compose.') + if _spec_has_templates(spec_text): + logger.info('Spec contains Jinja templates — expanding locally') + spec_text = _expand_jinja_locally(spec_text, set_variables, set_string_variables) created_work_dir = work_dir is None if work_dir is None: @@ -520,7 +535,8 @@ def run_workflow_compose(spec_path: str, work_dir: str | 
None = None, success = False try: executor = ComposeExecutor(work_dir=work_dir, keep_work_dir=keep_work_dir, - compose_cmd=compose_cmd, shm_size=shm_size) + compose_cmd=compose_cmd, shm_size=shm_size, + credentials=credentials) spec = executor.load_spec(spec_text) success = executor.execute(spec) finally: diff --git a/src/utils/standalone_executor.py b/src/utils/standalone_executor.py index a33dd9b84..1d8f20d29 100644 --- a/src/utils/standalone_executor.py +++ b/src/utils/standalone_executor.py @@ -17,6 +17,7 @@ """ import dataclasses +import hashlib import json import logging import os @@ -24,10 +25,13 @@ import shutil import subprocess import tempfile -from typing import Dict, List, Set +from typing import Any, Dict, List, Set +import jinja2 +import jinja2.sandbox import yaml +from src.lib.utils import workflow as workflow_utils from src.utils.job import task as task_module from src.utils.job import workflow as workflow_module @@ -81,12 +85,14 @@ class StandaloneExecutor: DEFAULT_SHM_SIZE = '16g' def __init__(self, work_dir: str, keep_work_dir: bool = False, docker_cmd: str = 'docker', - shm_size: str | None = None): + shm_size: str | None = None, + credentials: Dict[str, str] | None = None): """Initialize the executor with a work directory, cleanup preference, and container runtime command.""" self._work_dir = work_dir self._keep_work_dir = keep_work_dir self._docker_cmd = docker_cmd self._shm_size = shm_size + self._credentials = credentials or {} self._task_nodes: Dict[str, TaskNode] = {} self._group_specs: Dict[str, task_module.TaskGroupSpec] = {} self._results: Dict[str, TaskResult] = {} @@ -326,7 +332,7 @@ def visit(name: str) -> List[str] | None: _HOST_TOKEN_PATTERN = re.compile(r'\{\{\s*host:[^}]+\}\}') def _validate_for_standalone(self, spec: workflow_module.WorkflowSpec): - """Raise ValueError if the spec uses features unsupported in standalone mode (datasets, URLs, credentials, etc.).""" + """Raise ValueError if the spec uses features unsupported in 
standalone mode.""" unsupported_features = [] for group in self._groups(spec): for task_spec in group.tasks: @@ -339,13 +345,20 @@ def _validate_for_standalone(self, spec: workflow_module.WorkflowSpec): f'Task "{task_spec.name}": URL inputs require network/storage access') for output in task_spec.outputs: - if isinstance(output, (task_module.DatasetInputOutput, task_module.URLInputOutput)): + if isinstance(output, task_module.URLInputOutput): unsupported_features.append( - f'Task "{task_spec.name}": dataset/URL outputs require object storage') - - if task_spec.credentials: - unsupported_features.append( - f'Task "{task_spec.name}": credentials require the OSMO secret manager') + f'Task "{task_spec.name}": URL outputs require object storage') + elif isinstance(output, task_module.DatasetInputOutput): + logger.info( + 'Task "%s": dataset output "%s" ignored in standalone mode ' + '— data is available in the work directory', + task_spec.name, output.dataset.name) + + for cred_name in task_spec.credentials: + if cred_name not in self._credentials: + unsupported_features.append( + f'Task "{task_spec.name}": credential "{cred_name}" not provided. 
' + f'Use --credential {cred_name}=/path/to/dir') if task_spec.checkpoint: unsupported_features.append( @@ -499,6 +512,11 @@ def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskR host_path = os.path.realpath(os.path.join(files_dir, file_spec.path.lstrip('/'))) docker_args += ['-v', f'{host_path}:{file_spec.path}:ro'] + for cred_name, cred_mount in task_spec.credentials.items(): + if isinstance(cred_mount, str) and cred_name in self._credentials: + local_dir = os.path.abspath(self._credentials[cred_name]) + docker_args += ['-v', f'{local_dir}:{cred_mount}:ro'] + if resolved_command: docker_args += ['--entrypoint', resolved_command[0]] docker_args.append(task_spec.image) @@ -556,8 +574,66 @@ def _check_unresolved_tokens(self, task_name: str, resolved_fields: List[str]): if unresolved: raise ValueError( f'Task "{task_name}" has unresolved token(s): {", ".join(unresolved)}. ' - f'If this spec uses Jinja templates, run "osmo workflow submit --dry-run -f " ' - f'first to expand them.') + f'Use --set to provide values, or check for typos in template variable names.') + + +_OSMO_TOKEN_PATTERN = re.compile(r'\{\{(uuid|workflow_id|output|input:[^}]+|host:[^}]+)\}\}') + + +def _expand_jinja_locally(spec_text: str, + set_variables: List[str] | None = None, + set_string_variables: List[str] | None = None) -> str: + """Expand Jinja templates in a workflow spec using its default-values section and CLI overrides. + + Mirrors the server-side logic in TemplateSpec.load_template_with_variables but runs + entirely locally: no PostgreSQL, no sandboxed worker pool. OSMO-specific tokens + ({{output}}, {{input:...}}, {{host:...}}, {{uuid}}, {{workflow_id}}) are protected + from expansion and restored afterward. 
+ """ + file_text, default_values = workflow_utils.parse_workflow_spec(spec_text) + template_data: Dict[str, Any] = {} + if default_values: + template_data = default_values + + for data in (set_variables or []): + if '=' not in data: + raise ValueError(f'--set value "{data}" is incorrectly formatted (expected key=value)') + key, raw_value = data.split('=', 1) + try: + template_data[key] = int(raw_value) + except ValueError: + try: + template_data[key] = float(raw_value) + except ValueError: + template_data[key] = raw_value + + for data in (set_string_variables or []): + if '=' not in data: + raise ValueError( + f'--set-string value "{data}" is incorrectly formatted (expected key=value)') + key, raw_value = data.split('=', 1) + template_data[key] = raw_value + + placeholder_map: Dict[str, str] = {} + for match in _OSMO_TOKEN_PATTERN.finditer(file_text): + field = match.group(1).strip() + hash_key = 'hash' + str(int(hashlib.md5(field.encode('utf-8')).hexdigest(), 16)) + original_token = '{{' + match.group(1) + '}}' + template_data[hash_key] = original_token + placeholder_map[original_token] = hash_key + + protected_text = file_text + for original_token, hash_key in placeholder_map.items(): + protected_text = protected_text.replace(original_token, '{{' + hash_key + '}}') + + jinja_env = jinja2.sandbox.SandboxedEnvironment(undefined=jinja2.StrictUndefined) + template = jinja_env.from_string(protected_text) + return template.render(template_data) + + +def _spec_has_templates(spec_text: str) -> bool: + """Return True if the spec contains Jinja template markers that need expansion.""" + return any(marker in spec_text for marker in ('{%', '{#', 'default-values')) def run_workflow_standalone(spec_path: str, work_dir: str | None = None, @@ -565,7 +641,10 @@ def run_workflow_standalone(spec_path: str, work_dir: str | None = None, resume: bool = False, from_step: str | None = None, docker_cmd: str = 'docker', - shm_size: str | None = None) -> bool: + shm_size: str | None = 
None, + set_variables: List[str] | None = None, + set_string_variables: List[str] | None = None, + credentials: Dict[str, str] | None = None) -> bool: """Load a workflow spec from disk and execute it in standalone mode via Docker, managing the work directory lifecycle.""" if (resume or from_step) and work_dir is None: raise ValueError( @@ -574,12 +653,9 @@ def run_workflow_standalone(spec_path: str, work_dir: str | None = None, with open(spec_path, encoding='utf-8') as f: spec_text = f.read() - template_markers = ('{%', '{#', 'default-values') - if any(marker in spec_text for marker in template_markers): - raise ValueError( - 'This spec uses Jinja templates which require server-side expansion.\n' - 'Run "osmo workflow submit --dry-run -f " first to get the expanded spec,\n' - 'then save that output and run it standalone.') + if _spec_has_templates(spec_text): + logger.info('Spec contains Jinja templates — expanding locally') + spec_text = _expand_jinja_locally(spec_text, set_variables, set_string_variables) created_work_dir = work_dir is None if work_dir is None: @@ -589,7 +665,8 @@ def run_workflow_standalone(spec_path: str, work_dir: str | None = None, success = False try: executor = StandaloneExecutor(work_dir=work_dir, keep_work_dir=keep_work_dir, - docker_cmd=docker_cmd, shm_size=shm_size) + docker_cmd=docker_cmd, shm_size=shm_size, + credentials=credentials) spec = executor.load_spec(spec_text) success = executor.execute(spec, resume=resume or from_step is not None, from_step=from_step) From 612fa2fe19b889f6a04f497ddba08fe19f7b91ff Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Fri, 10 Apr 2026 17:05:40 -0700 Subject: [PATCH 26/35] Enhance credential handling in Compose and Standalone Executors - Updated credential processing to support dict-style mappings, raising clear errors for unsupported configurations. - Improved error messages for missing credentials, guiding users on the correct format for credential specification. 
- Adjusted cleanup logic in `run_workflow_compose` and `run_workflow_standalone` to ensure work directories are only removed on successful execution. --- src/utils/compose_executor.py | 16 +++++++++++++--- src/utils/standalone_executor.py | 28 ++++++++++++++++++++-------- 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/src/utils/compose_executor.py b/src/utils/compose_executor.py index bfe718841..510efcf53 100644 --- a/src/utils/compose_executor.py +++ b/src/utils/compose_executor.py @@ -190,8 +190,13 @@ def _validate_for_compose(self, spec: workflow_module.WorkflowSpec): '— data is available in the work directory', task_spec.name, output.dataset.name) - for cred_name in task_spec.credentials: - if cred_name not in self._credentials: + for cred_name, cred_mount in task_spec.credentials.items(): + if isinstance(cred_mount, dict): + unsupported_features.append( + f'Task "{task_spec.name}": credential "{cred_name}" uses ' + f'dict-style mapping which is not supported in docker-compose ' + f'mode; provide credentials as NAME=/path') + elif cred_name not in self._credentials: unsupported_features.append( f'Task "{task_spec.name}": credential "{cred_name}" not provided. 
' f'Use --credential {cred_name}=/path/to/dir') @@ -366,6 +371,11 @@ def _build_compose_service(self, node: TaskNode, volumes.append(f'{host_path}:{file_spec.path}:ro') for cred_name, cred_mount in task_spec.credentials.items(): + if isinstance(cred_mount, dict): + raise ValueError( + f'Task "{node.name}": credential "{cred_name}" uses dict-style ' + f'mapping which is not supported in docker-compose mode; ' + f'provide credentials as NAME=/path') if isinstance(cred_mount, str) and cred_name in self._credentials: local_dir = os.path.abspath(self._credentials[cred_name]) volumes.append(f'{local_dir}:{cred_mount}:ro') @@ -540,7 +550,7 @@ def run_workflow_compose(spec_path: str, work_dir: str | None = None, spec = executor.load_spec(spec_text) success = executor.execute(spec) finally: - if created_work_dir and not keep_work_dir: + if created_work_dir and not keep_work_dir and success: shutil.rmtree(work_dir, ignore_errors=True) elif not success: logger.info('Work directory preserved for debugging: %s', work_dir) diff --git a/src/utils/standalone_executor.py b/src/utils/standalone_executor.py index 1d8f20d29..0a36a24cd 100644 --- a/src/utils/standalone_executor.py +++ b/src/utils/standalone_executor.py @@ -74,11 +74,12 @@ class StandaloneExecutor: - `environment:` passed as Docker env vars - Task-to-task data flow via shared local directories - GPU passthrough via --gpus for tasks that declare gpu > 0 in resources + - Credentials via --credential NAME=/path (mounted read-only in _run_task) + - Jinja-templated specs (expanded locally via _expand_jinja_locally) Does NOT support (raises clear errors): - Dataset / URL inputs/outputs (require object storage) - - Credentials, checkpoints, volumeMounts (require cluster infra) - - Templated specs with Jinja (require server-side expansion; use --dry-run first) + - Checkpoints, volumeMounts (require cluster infra) - {{host:taskname}} tokens (require parallel containers with shared networking) """ @@ -354,8 +355,14 @@ def 
_validate_for_standalone(self, spec: workflow_module.WorkflowSpec): '— data is available in the work directory', task_spec.name, output.dataset.name) - for cred_name in task_spec.credentials: - if cred_name not in self._credentials: + for cred_name, cred_mount in task_spec.credentials.items(): + if isinstance(cred_mount, dict): + unsupported_features.append( + f'Task "{task_spec.name}": credential "{cred_name}" uses ' + f'dict-style mapping which the standalone executor does not ' + f'support; provide credentials as NAME=/path or flatten the ' + f'mapping') + elif cred_name not in self._credentials: unsupported_features.append( f'Task "{task_spec.name}": credential "{cred_name}" not provided. ' f'Use --credential {cred_name}=/path/to/dir') @@ -513,6 +520,11 @@ def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskR docker_args += ['-v', f'{host_path}:{file_spec.path}:ro'] for cred_name, cred_mount in task_spec.credentials.items(): + if isinstance(cred_mount, dict): + raise ValueError( + f'Task "{node.name}": credential "{cred_name}" uses dict-style ' + f'mapping which the standalone executor does not support; ' + f'provide credentials as NAME=/path or flatten the mapping') if isinstance(cred_mount, str) and cred_name in self._credentials: local_dir = os.path.abspath(self._credentials[cred_name]) docker_args += ['-v', f'{local_dir}:{cred_mount}:ro'] @@ -577,7 +589,7 @@ def _check_unresolved_tokens(self, task_name: str, resolved_fields: List[str]): f'Use --set to provide values, or check for typos in template variable names.') -_OSMO_TOKEN_PATTERN = re.compile(r'\{\{(uuid|workflow_id|output|input:[^}]+|host:[^}]+)\}\}') +_OSMO_TOKEN_PATTERN = re.compile(r'\{\{\s*(uuid|workflow_id|output|input:[^}]+|host:[^}]+)\s*\}\}') def _expand_jinja_locally(spec_text: str, @@ -618,7 +630,7 @@ def _expand_jinja_locally(spec_text: str, for match in _OSMO_TOKEN_PATTERN.finditer(file_text): field = match.group(1).strip() hash_key = 'hash' + 
str(int(hashlib.md5(field.encode('utf-8')).hexdigest(), 16)) - original_token = '{{' + match.group(1) + '}}' + original_token = match.group(0) template_data[hash_key] = original_token placeholder_map[original_token] = hash_key @@ -633,7 +645,7 @@ def _expand_jinja_locally(spec_text: str, def _spec_has_templates(spec_text: str) -> bool: """Return True if the spec contains Jinja template markers that need expansion.""" - return any(marker in spec_text for marker in ('{%', '{#', 'default-values')) + return any(marker in spec_text for marker in ('{{', '{%', '{#', 'default-values')) def run_workflow_standalone(spec_path: str, work_dir: str | None = None, @@ -671,7 +683,7 @@ def run_workflow_standalone(spec_path: str, work_dir: str | None = None, success = executor.execute(spec, resume=resume or from_step is not None, from_step=from_step) finally: - if created_work_dir and not keep_work_dir: + if created_work_dir and not keep_work_dir and success: shutil.rmtree(work_dir, ignore_errors=True) elif not success: logger.info('Work directory preserved for debugging: %s', work_dir) From adad9d7c185150b617543415c1ea67ace4840bb7 Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Fri, 10 Apr 2026 17:44:44 -0700 Subject: [PATCH 27/35] Enhance workflow fingerprinting in StandaloneExecutor - Introduced a new method `_compute_workflow_fingerprint` to generate a SHA-256 hash of task specifications, allowing for detection of workflow changes across runs. - Updated the state management to include the workflow fingerprint, enabling verification of spec consistency between runs and warning users of potential stale outputs. - Improved GPU device specification formatting in Docker arguments to ensure correct syntax without unnecessary quotes. 
--- src/utils/compose_executor.py | 4 +++- src/utils/standalone_executor.py | 41 ++++++++++++++++++++++++++++---- 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/src/utils/compose_executor.py b/src/utils/compose_executor.py index 510efcf53..c0be7b1b0 100644 --- a/src/utils/compose_executor.py +++ b/src/utils/compose_executor.py @@ -75,7 +75,9 @@ def _compose_file_path(self) -> str: return os.path.join(self._work_dir, COMPOSE_FILE_NAME) def _compose_project_name(self, spec: workflow_module.WorkflowSpec) -> str: - return f'osmo-{re.sub(r"[^a-z0-9-]", "-", spec.name.lower())}' + sanitized = re.sub(r'[^a-z0-9-]', '-', spec.name.lower()) + sanitized = re.sub(r'-{2,}', '-', sanitized).strip('-') + return f'osmo-{sanitized}' if sanitized else 'osmo-default' def _compose_base_cmd(self, spec: workflow_module.WorkflowSpec) -> List[str]: return ( diff --git a/src/utils/standalone_executor.py b/src/utils/standalone_executor.py index 0a36a24cd..47164d16a 100644 --- a/src/utils/standalone_executor.py +++ b/src/utils/standalone_executor.py @@ -98,6 +98,7 @@ def __init__(self, work_dir: str, keep_work_dir: bool = False, docker_cmd: str = self._group_specs: Dict[str, task_module.TaskGroupSpec] = {} self._results: Dict[str, TaskResult] = {} self._available_gpus: int | None = None + self._workflow_fingerprint: str = '' def _detect_available_gpus(self) -> int: """Query nvidia-smi to count available GPUs, caching the result for subsequent calls.""" @@ -122,6 +123,23 @@ def _detect_available_gpus(self) -> int: self._available_gpus = 0 return self._available_gpus + def _compute_workflow_fingerprint(self) -> str: + """Compute a deterministic SHA-256 hash from task specs to detect workflow changes across runs.""" + fingerprint_data: List[Dict[str, Any]] = [] + for name in sorted(self._task_nodes): + spec = self._task_nodes[name].spec + fingerprint_data.append({ + 'name': name, + 'image': spec.image, + 'command': spec.command, + 'args': spec.args, + 'environment': 
dict(sorted(spec.environment.items())), + 'inputs': [str(i) for i in spec.inputs], + 'resource': spec.resource, + }) + blob = json.dumps(fingerprint_data, sort_keys=True, separators=(',', ':')) + return hashlib.sha256(blob.encode('utf-8')).hexdigest() + def load_spec(self, spec_text: str) -> workflow_module.WorkflowSpec: """Parse raw YAML text into a validated WorkflowSpec via the versioned spec model.""" raw = yaml.safe_load(spec_text) @@ -137,6 +155,7 @@ def execute(self, spec: workflow_module.WorkflowSpec, """Run all tasks in topological order, returning True if the entire workflow succeeds.""" self._results.clear() self._build_dag(spec) + self._workflow_fingerprint = self._compute_workflow_fingerprint() self._validate_for_standalone(spec) self._setup_directories() @@ -202,12 +221,13 @@ def _state_file_path(self) -> str: def _save_state(self): """Persist current task results to the state file so runs can be resumed later.""" - state = { + state: Dict[str, Any] = { + 'workflow_fingerprint': self._workflow_fingerprint, 'tasks': { name: {'exit_code': result.exit_code, 'output_dir': result.output_dir} for name, result in self._results.items() if result.exit_code != -1 - } + }, } with open(self._state_file_path, 'w', encoding='utf-8') as f: json.dump(state, f, indent=2) @@ -226,6 +246,17 @@ def _restore_completed_tasks(self, from_step: str | None = None): logger.info('No previous state found — starting from scratch') return + saved_fingerprint = state.get('workflow_fingerprint') + if not saved_fingerprint: + logger.warning( + 'State file has no workflow fingerprint — cannot verify ' + 'that the spec matches the previous run; reused outputs may be stale') + elif saved_fingerprint != self._workflow_fingerprint: + logger.warning( + 'Workflow spec has changed since the previous run ' + '(fingerprint %s → %s); reused outputs may be stale', + saved_fingerprint[:12], self._workflow_fingerprint[:12]) + completed: Dict[str, Dict] = {} for name, info in state.get('tasks', 
{}).items(): if name not in self._task_nodes: @@ -496,16 +527,16 @@ def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskR logger.warning( 'Task "%s" requests %d GPU(s) but only %d available — running with %d GPU(s)', node.name, gpu_count, available, available) - docker_args += ['--gpus', f'"device={",".join(str(i) for i in range(available))}"'] + docker_args += ['--gpus', f'device={",".join(str(i) for i in range(available))}'] else: - docker_args += ['--gpus', f'"device={",".join(str(i) for i in range(gpu_count))}"'] + docker_args += ['--gpus', f'device={",".join(str(i) for i in range(gpu_count))}'] logger.info('Task "%s" requesting %d GPU(s), using %d', node.name, gpu_count, min(gpu_count, available)) docker_args += ['--shm-size', self._shm_size or self.DEFAULT_SHM_SIZE] elif self._shm_size: docker_args += ['--shm-size', self._shm_size] - for env_key, resolved_value in zip(task_spec.environment.keys(), resolved_env_values): + for env_key, resolved_value in zip(task_spec.environment.keys(), resolved_env_values, strict=True): docker_args += ['-e', f'{env_key}={resolved_value}'] docker_args += ['-v', f'{output_dir}:{CONTAINER_DATA_PATH}/output'] From e778e58aed600f09274c6896f4b6bab8bf610e12 Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Fri, 10 Apr 2026 18:18:31 -0700 Subject: [PATCH 28/35] Improve error handling and logging in Compose and Standalone Executors - Added error logging for JSON parsing failures in `ComposeExecutor` to aid in debugging. - Enhanced warning messages in `ComposeExecutor` to include the full Docker compose output when no container info is found. - Updated `StandaloneExecutor` to use `deque` for improved performance in task processing queues. 
--- src/utils/compose_executor.py | 5 ++++- src/utils/standalone_executor.py | 14 ++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/utils/compose_executor.py b/src/utils/compose_executor.py index c0be7b1b0..e81f1ea88 100644 --- a/src/utils/compose_executor.py +++ b/src/utils/compose_executor.py @@ -497,6 +497,7 @@ def _get_service_exit_code(self, service_name: str, try: container_info = json.loads(line) except json.JSONDecodeError: + logger.error('Failed to parse container info line as JSON: %s', line, exc_info=True) continue if isinstance(container_info, list): for entry in container_info: @@ -505,7 +506,9 @@ def _get_service_exit_code(self, service_name: str, elif container_info.get('Service') == service_name: return container_info.get('ExitCode', 1) - logger.warning('No container info found for service "%s"', service_name) + logger.warning( + 'No container info found for service "%s" in docker compose output:\n%s', + service_name, result.stdout.strip()) return 1 except (subprocess.TimeoutExpired, FileNotFoundError): logger.warning('Could not determine exit code for "%s"', service_name) diff --git a/src/utils/standalone_executor.py b/src/utils/standalone_executor.py index 47164d16a..290287185 100644 --- a/src/utils/standalone_executor.py +++ b/src/utils/standalone_executor.py @@ -25,6 +25,7 @@ import shutil import subprocess import tempfile +from collections import deque from typing import Any, Dict, List, Set import jinja2 @@ -128,6 +129,10 @@ def _compute_workflow_fingerprint(self) -> str: fingerprint_data: List[Dict[str, Any]] = [] for name in sorted(self._task_nodes): spec = self._task_nodes[name].spec + files_data = [ + {'path': f.path, 'contents': f.contents, 'base64': f.base64} + for f in sorted(spec.files, key=lambda f: f.path) + ] if spec.files else [] fingerprint_data.append({ 'name': name, 'image': spec.image, @@ -136,6 +141,7 @@ def _compute_workflow_fingerprint(self) -> str: 'environment': 
dict(sorted(spec.environment.items())), 'inputs': [str(i) for i in spec.inputs], 'resource': spec.resource, + 'files': files_data, }) blob = json.dumps(fingerprint_data, sort_keys=True, separators=(',', ':')) return hashlib.sha256(blob.encode('utf-8')).hexdigest() @@ -280,9 +286,9 @@ def _restore_completed_tasks(self, from_step: str | None = None): def _get_downstream_tasks(self, task_name: str) -> Set[str]: """Return all transitive downstream dependents of the given task via BFS.""" visited: Set[str] = set() - queue = [task_name] + queue: deque[str] = deque([task_name]) while queue: - current = queue.pop(0) + current = queue.popleft() for downstream in self._task_nodes[current].downstream: if downstream not in visited: visited.add(downstream) @@ -466,9 +472,9 @@ def _find_ready_tasks(self) -> List[str]: def _cancel_downstream(self, failed_task: str): """Mark all transitive downstream tasks of a failed task as cancelled (exit_code -1).""" visited: Set[str] = set() - queue = [failed_task] + queue: deque[str] = deque([failed_task]) while queue: - current = queue.pop(0) + current = queue.popleft() for downstream in self._task_nodes[current].downstream: if downstream not in visited and downstream not in self._results: visited.add(downstream) From f6df82335cba2d2a4da7af6d55bbc65836de9140 Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Fri, 10 Apr 2026 18:53:40 -0700 Subject: [PATCH 29/35] Refactor Compose and Standalone Executors for improved command handling and state management - Replaced string splitting with `shlex.split` in `ComposeExecutor` to ensure correct parsing of command arguments. - Enhanced `_compute_workflow_fingerprint` in `StandaloneExecutor` to accept `WorkflowSpec` directly, improving clarity and functionality. - Updated state file handling in `StandaloneExecutor` to use a temporary file for safer writes and added error handling for corrupt state files. 
--- src/utils/compose_executor.py | 3 +- src/utils/standalone_executor.py | 58 +++++++++++++++++++++++--------- 2 files changed, 45 insertions(+), 16 deletions(-) diff --git a/src/utils/compose_executor.py b/src/utils/compose_executor.py index e81f1ea88..97f490f48 100644 --- a/src/utils/compose_executor.py +++ b/src/utils/compose_executor.py @@ -20,6 +20,7 @@ import logging import os import re +import shlex import shutil import subprocess import tempfile @@ -81,7 +82,7 @@ def _compose_project_name(self, spec: workflow_module.WorkflowSpec) -> str: def _compose_base_cmd(self, spec: workflow_module.WorkflowSpec) -> List[str]: return ( - self._compose_cmd.split() + shlex.split(self._compose_cmd) + ['-p', self._compose_project_name(spec), '-f', self._compose_file_path] ) diff --git a/src/utils/standalone_executor.py b/src/utils/standalone_executor.py index 290287185..6d608cccb 100644 --- a/src/utils/standalone_executor.py +++ b/src/utils/standalone_executor.py @@ -25,6 +25,7 @@ import shutil import subprocess import tempfile +import time from collections import deque from typing import Any, Dict, List, Set @@ -124,23 +125,28 @@ def _detect_available_gpus(self) -> int: self._available_gpus = 0 return self._available_gpus - def _compute_workflow_fingerprint(self) -> str: + def _compute_workflow_fingerprint(self, spec: workflow_module.WorkflowSpec) -> str: """Compute a deterministic SHA-256 hash from task specs to detect workflow changes across runs.""" fingerprint_data: List[Dict[str, Any]] = [] for name in sorted(self._task_nodes): - spec = self._task_nodes[name].spec + task_spec = self._task_nodes[name].spec files_data = [ {'path': f.path, 'contents': f.contents, 'base64': f.base64} - for f in sorted(spec.files, key=lambda f: f.path) - ] if spec.files else [] + for f in sorted(task_spec.files, key=lambda f: f.path) + ] if task_spec.files else [] + named_resource = spec.resources.get(task_spec.resource) + named_resource_dict = named_resource.model_dump(exclude_none=True) 
if named_resource else {} + inline_resource_dict = task_spec.resources.model_dump(exclude_defaults=True) + effective_resource = {**named_resource_dict, **inline_resource_dict} fingerprint_data.append({ 'name': name, - 'image': spec.image, - 'command': spec.command, - 'args': spec.args, - 'environment': dict(sorted(spec.environment.items())), - 'inputs': [str(i) for i in spec.inputs], - 'resource': spec.resource, + 'image': task_spec.image, + 'command': task_spec.command, + 'args': task_spec.args, + 'environment': dict(sorted(task_spec.environment.items())), + 'inputs': [str(i) for i in task_spec.inputs], + 'resource': task_spec.resource, + 'resource_config': effective_resource, 'files': files_data, }) blob = json.dumps(fingerprint_data, sort_keys=True, separators=(',', ':')) @@ -161,7 +167,7 @@ def execute(self, spec: workflow_module.WorkflowSpec, """Run all tasks in topological order, returning True if the entire workflow succeeds.""" self._results.clear() self._build_dag(spec) - self._workflow_fingerprint = self._compute_workflow_fingerprint() + self._workflow_fingerprint = self._compute_workflow_fingerprint(spec) self._validate_for_standalone(spec) self._setup_directories() @@ -235,15 +241,37 @@ def _save_state(self): if result.exit_code != -1 }, } - with open(self._state_file_path, 'w', encoding='utf-8') as f: + tmp_path = self._state_file_path + '.tmp' + with open(tmp_path, 'w', encoding='utf-8') as f: json.dump(state, f, indent=2) + f.flush() + os.fsync(f.fileno()) + os.replace(tmp_path, self._state_file_path) + state_dir = os.path.dirname(self._state_file_path) or '.' 
+ dir_fd = os.open(state_dir, os.O_RDONLY) + try: + os.fsync(dir_fd) + finally: + os.close(dir_fd) def _load_state(self) -> Dict | None: - """Load previously saved task state from disk, returning None if no state file exists.""" + """Load previously saved task state from disk, returning None if no state file exists or if the file is corrupt.""" if not os.path.exists(self._state_file_path): return None - with open(self._state_file_path, encoding='utf-8') as f: - return json.load(f) + try: + with open(self._state_file_path, encoding='utf-8') as f: + return json.load(f) + except (json.JSONDecodeError, IOError) as exc: + corrupt_path = f'{self._state_file_path}.corrupt.{int(time.time())}' + try: + os.rename(self._state_file_path, corrupt_path) + logger.warning( + 'State file is corrupt (%s); renamed to %s and starting fresh', + exc, corrupt_path) + except OSError: + logger.warning( + 'State file is corrupt (%s); starting fresh', exc) + return None def _restore_completed_tasks(self, from_step: str | None = None): """Reload completed tasks from a previous run, optionally invalidating from a given step onward.""" From e9647a6961191c1c33033f87d8579521c5531a38 Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Fri, 10 Apr 2026 19:30:14 -0700 Subject: [PATCH 30/35] Refactor hash generation in StandaloneExecutor and improve cycle detection logic - Updated the cycle detection logic to use list unpacking for clarity in `StandaloneExecutor`. - Changed hash generation from MD5 to SHA-256 for improved security in `_expand_jinja_locally`, ensuring better uniqueness and collision resistance. 
--- src/utils/standalone_executor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/utils/standalone_executor.py b/src/utils/standalone_executor.py index 6d608cccb..5486386d5 100644 --- a/src/utils/standalone_executor.py +++ b/src/utils/standalone_executor.py @@ -377,7 +377,7 @@ def visit(name: str) -> List[str] | None: return None if state[name] == IN_PROGRESS: cycle_start = path.index(name) - return path[cycle_start:] + [name] + return [*path[cycle_start:], name] state[name] = IN_PROGRESS path.append(name) @@ -694,7 +694,7 @@ def _expand_jinja_locally(spec_text: str, placeholder_map: Dict[str, str] = {} for match in _OSMO_TOKEN_PATTERN.finditer(file_text): field = match.group(1).strip() - hash_key = 'hash' + str(int(hashlib.md5(field.encode('utf-8')).hexdigest(), 16)) + hash_key = 'hash' + str(int(hashlib.sha256(field.encode('utf-8')).hexdigest(), 16)) original_token = match.group(0) template_data[hash_key] = original_token placeholder_map[original_token] = hash_key From 3c91e1d0f9fb87fbfcc1d4fbd2feff19de372610 Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Fri, 10 Apr 2026 19:41:04 -0700 Subject: [PATCH 31/35] Refactor file writing logic in StandaloneExecutor to support base64 encoding - Introduced base64 decoding for file writing in `StandaloneExecutor`, allowing for proper handling of encoded content. - Ensured cleanup of rerun output directories is executed when resuming tasks, improving state management. 
--- src/utils/standalone_executor.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/utils/standalone_executor.py b/src/utils/standalone_executor.py index 5486386d5..05757e6c1 100644 --- a/src/utils/standalone_executor.py +++ b/src/utils/standalone_executor.py @@ -16,6 +16,7 @@ SPDX-License-Identifier: Apache-2.0 """ +import base64 as base64_module import dataclasses import hashlib import json @@ -173,7 +174,8 @@ def execute(self, spec: workflow_module.WorkflowSpec, if resume or from_step: self._restore_completed_tasks(from_step) - self._clean_rerun_output_dirs() + + self._clean_rerun_output_dirs() total_tasks = sum(len(g.tasks) for g in self._groups(spec)) skipped = len(self._results) @@ -537,8 +539,12 @@ def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskR raise ValueError( f'Task "{node.name}": file path "{file_spec.path}" escapes the task directory') os.makedirs(os.path.dirname(host_path), exist_ok=True) - with open(host_path, 'w', encoding='utf-8') as f: - f.write(resolved_contents) + if file_spec.base64: + with open(host_path, 'wb') as f: + f.write(base64_module.b64decode(resolved_contents)) + else: + with open(host_path, 'w', encoding='utf-8') as f: + f.write(resolved_contents) resolved_command = [self._substitute_tokens(c, token_map) for c in task_spec.command] resolved_args = [self._substitute_tokens(a, token_map) for a in task_spec.args] From 0bd4a99ee24cff9fab317886d8cdd883ae83d375 Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Fri, 10 Apr 2026 21:50:47 -0700 Subject: [PATCH 32/35] Enhance credential handling in StandaloneExecutor - Added support for credential data mapping in `_compute_workflow_fingerprint`, allowing for better management of task-specific credentials. - Updated `_validate_for_standalone` to raise a ValueError for unsupported timeout features in standalone mode, improving error handling and user guidance. 
- Ensured directory setup is executed correctly during workflow execution, enhancing state management. --- src/utils/standalone_executor.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/utils/standalone_executor.py b/src/utils/standalone_executor.py index 05757e6c1..d3ac8a94e 100644 --- a/src/utils/standalone_executor.py +++ b/src/utils/standalone_executor.py @@ -139,6 +139,14 @@ def _compute_workflow_fingerprint(self, spec: workflow_module.WorkflowSpec) -> s named_resource_dict = named_resource.model_dump(exclude_none=True) if named_resource else {} inline_resource_dict = task_spec.resources.model_dump(exclude_defaults=True) effective_resource = {**named_resource_dict, **inline_resource_dict} + credentials_data = { + cred_name: { + 'mount_path': cred_mount, + 'source': os.path.abspath(self._credentials[cred_name]) + if cred_name in self._credentials else None, + } + for cred_name, cred_mount in sorted(task_spec.credentials.items()) + } if task_spec.credentials else {} fingerprint_data.append({ 'name': name, 'image': task_spec.image, @@ -149,6 +157,7 @@ def _compute_workflow_fingerprint(self, spec: workflow_module.WorkflowSpec) -> s 'resource': task_spec.resource, 'resource_config': effective_resource, 'files': files_data, + 'credentials': credentials_data, }) blob = json.dumps(fingerprint_data, sort_keys=True, separators=(',', ':')) return hashlib.sha256(blob.encode('utf-8')).hexdigest() @@ -170,11 +179,11 @@ def execute(self, spec: workflow_module.WorkflowSpec, self._build_dag(spec) self._workflow_fingerprint = self._compute_workflow_fingerprint(spec) self._validate_for_standalone(spec) - self._setup_directories() if resume or from_step: self._restore_completed_tasks(from_step) + self._setup_directories() self._clean_rerun_output_dirs() total_tasks = sum(len(g.tasks) for g in self._groups(spec)) @@ -402,6 +411,12 @@ def visit(name: str) -> List[str] | None: def _validate_for_standalone(self, spec: 
workflow_module.WorkflowSpec): """Raise ValueError if the spec uses features unsupported in standalone mode.""" unsupported_features = [] + + if spec.timeout.exec_timeout is not None or spec.timeout.queue_timeout is not None: + unsupported_features.append( + 'WorkflowSpec.timeout is not supported in standalone mode; ' + 'use the service executor or remove the timeout') + for group in self._groups(spec): for task_spec in group.tasks: for input_source in task_spec.inputs: From e782e4de1b8551532f3de1abaaa5c783ea52edfa Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Wed, 27 May 2026 11:58:53 -0700 Subject: [PATCH 33/35] Refactor GPU handling in StandaloneExecutor - Updated GPU allocation logic to use 'all' when requested GPUs exceed available resources, improving resource management. - Adjusted logging messages for clarity when running tasks without GPU support or with limited GPU availability. - Ensured correct formatting for GPU device specifications in Docker arguments. --- src/utils/standalone_executor.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/utils/standalone_executor.py b/src/utils/standalone_executor.py index d3ac8a94e..ea8701a3a 100644 --- a/src/utils/standalone_executor.py +++ b/src/utils/standalone_executor.py @@ -578,13 +578,14 @@ def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskR logger.warning( 'Task "%s" requests %d GPU(s) but no GPUs are available — running without GPU support', node.name, gpu_count) - elif gpu_count > available: - logger.warning( - 'Task "%s" requests %d GPU(s) but only %d available — running with %d GPU(s)', - node.name, gpu_count, available, available) - docker_args += ['--gpus', f'device={",".join(str(i) for i in range(available))}'] + elif gpu_count >= available: + if gpu_count > available: + logger.warning( + 'Task "%s" requests %d GPU(s) but only %d available — running with %d GPU(s)', + node.name, gpu_count, available, available) + docker_args += 
['--gpus', 'all'] else: - docker_args += ['--gpus', f'device={",".join(str(i) for i in range(gpu_count))}'] + docker_args += ['--gpus', f'"device={",".join(str(i) for i in range(gpu_count))}"'] logger.info('Task "%s" requesting %d GPU(s), using %d', node.name, gpu_count, min(gpu_count, available)) docker_args += ['--shm-size', self._shm_size or self.DEFAULT_SHM_SIZE] From 1be5606225718167353b5d333c5882c094c6ebcf Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Thu, 4 Jun 2026 11:39:31 -0700 Subject: [PATCH 34/35] Refactor file handling and enhance test coverage in StandaloneExecutor and ComposeExecutor - Updated file writing logic in `StandaloneExecutor` to use more descriptive variable names for file handlers, improving code readability. - Enhanced test cases in `TestComposeValidation` and `TestJinjaTemplateDetection` to better validate credential handling and Jinja template expansion, ensuring robustness in workflow specifications. - Adjusted error messages for clarity in credential validation tests, providing more informative feedback for users. 
--- src/utils/standalone_executor.py | 8 +- src/utils/tests/test_compose_executor.py | 74 ++++++++----- src/utils/tests/test_standalone_executor.py | 116 +++++++++----------- 3 files changed, 99 insertions(+), 99 deletions(-) diff --git a/src/utils/standalone_executor.py b/src/utils/standalone_executor.py index ea8701a3a..7aba35fbf 100644 --- a/src/utils/standalone_executor.py +++ b/src/utils/standalone_executor.py @@ -555,11 +555,11 @@ def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskR f'Task "{node.name}": file path "{file_spec.path}" escapes the task directory') os.makedirs(os.path.dirname(host_path), exist_ok=True) if file_spec.base64: - with open(host_path, 'wb') as f: - f.write(base64_module.b64decode(resolved_contents)) + with open(host_path, 'wb') as binary_file: + binary_file.write(base64_module.b64decode(resolved_contents)) else: - with open(host_path, 'w', encoding='utf-8') as f: - f.write(resolved_contents) + with open(host_path, 'w', encoding='utf-8') as text_file: + text_file.write(resolved_contents) resolved_command = [self._substitute_tokens(c, token_map) for c in task_spec.command] resolved_args = [self._substitute_tokens(a, token_map) for a in task_spec.args] diff --git a/src/utils/tests/test_compose_executor.py b/src/utils/tests/test_compose_executor.py index e824d2b9a..a2276b011 100644 --- a/src/utils/tests/test_compose_executor.py +++ b/src/utils/tests/test_compose_executor.py @@ -32,7 +32,12 @@ ComposeExecutor, run_workflow_compose, ) -from src.utils.standalone_executor import CONTAINER_DATA_PATH, TaskResult +from src.utils.standalone_executor import ( + CONTAINER_DATA_PATH, + TaskResult, + _expand_jinja_locally, + _spec_has_templates, +) def _docker_compose_available() -> bool: @@ -437,8 +442,8 @@ def test_dataset_input_rejected(self): executor._validate_for_compose(spec) self.assertIn('dataset', str(context.exception)) - def test_credentials_rejected(self): - """Credentials are rejected in compose mode.""" + def 
test_credential_not_provided_rejected(self): + """A credential required by a task but not supplied via --credential is rejected.""" spec_text = textwrap.dedent('''\ workflow: name: bad @@ -454,7 +459,25 @@ def test_credentials_rejected(self): executor._build_dag(spec) with self.assertRaises(ValueError) as context: executor._validate_for_compose(spec) - self.assertIn('credentials', str(context.exception)) + self.assertIn('credential', str(context.exception)) + + def test_provided_credential_passes(self): + """A credential supplied via --credential is accepted in compose mode.""" + spec_text = textwrap.dedent('''\ + workflow: + name: ok + tasks: + - name: task + image: ubuntu:24.04 + command: ["echo"] + credentials: + my-secret: NGC_API_KEY + ''') + executor = ComposeExecutor( + work_dir='/tmp/unused', credentials={'my-secret': '/tmp/secret-dir'}) + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + executor._validate_for_compose(spec) def test_simple_spec_passes(self): """A simple spec with only task-to-task inputs passes compose validation.""" @@ -637,33 +660,26 @@ def test_name_with_special_chars(self): class TestJinjaTemplateDetection(unittest.TestCase): - """Verify that Jinja templates are rejected before execution.""" + """Verify that Jinja templates are detected and expanded locally before execution.""" - def _write_temp_spec(self, content: str) -> str: - f = tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) - f.write(content) - f.flush() - f.close() - return f.name - - def test_jinja_block_detected(self): - path = self._write_temp_spec(textwrap.dedent('''\ + def test_jinja_block_expanded(self): + """A spec with {% %} Jinja block tags is detected and expanded locally.""" + spec_text = textwrap.dedent('''\ workflow: name: {% if true %}test{% endif %} tasks: - name: task image: alpine:3.18 command: ["echo"] - ''')) - try: - with self.assertRaises(ValueError) as context: - run_workflow_compose(path) - self.assertIn('Jinja', 
str(context.exception)) - finally: - os.unlink(path) + ''') + self.assertTrue(_spec_has_templates(spec_text)) + expanded = _expand_jinja_locally(spec_text) + self.assertNotIn('{%', expanded) + self.assertIn('name: test', expanded) - def test_default_values_detected(self): - path = self._write_temp_spec(textwrap.dedent('''\ + def test_default_values_expanded(self): + """A spec with a default-values section has its {{ }} variables expanded locally.""" + spec_text = textwrap.dedent('''\ workflow: name: "{{experiment}}" tasks: @@ -672,13 +688,11 @@ def test_default_values_detected(self): command: ["echo"] default-values: experiment: test - ''')) - try: - with self.assertRaises(ValueError) as context: - run_workflow_compose(path) - self.assertIn('Jinja', str(context.exception)) - finally: - os.unlink(path) + ''') + self.assertTrue(_spec_has_templates(spec_text)) + expanded = _expand_jinja_locally(spec_text) + self.assertNotIn('{{', expanded) + self.assertIn('test', expanded) class TestUnresolvedTokenDetection(unittest.TestCase): diff --git a/src/utils/tests/test_standalone_executor.py b/src/utils/tests/test_standalone_executor.py index 2eae253f6..debf61568 100644 --- a/src/utils/tests/test_standalone_executor.py +++ b/src/utils/tests/test_standalone_executor.py @@ -25,7 +25,15 @@ from unittest import mock from src.utils.job import task as task_module -from src.utils.standalone_executor import CONTAINER_DATA_PATH, StandaloneExecutor, TaskNode, TaskResult, run_workflow_standalone +from src.utils.standalone_executor import ( + CONTAINER_DATA_PATH, + StandaloneExecutor, + TaskNode, + TaskResult, + _expand_jinja_locally, + _spec_has_templates, + run_workflow_standalone, +) # --------------------------------------------------------------------------- @@ -784,11 +792,11 @@ def test_url_input_rejected(self): executor._validate_for_standalone(spec) self.assertIn('URL', str(context.exception)) - def test_dataset_output_rejected(self): - """A spec with dataset outputs is rejected 
as unsupported in standalone mode.""" + def test_dataset_output_ignored(self): + """A spec with dataset outputs passes validation; the output is ignored in standalone mode.""" spec_text = textwrap.dedent('''\ workflow: - name: bad + name: ok tasks: - name: task image: ubuntu:24.04 @@ -800,9 +808,7 @@ def test_dataset_output_rejected(self): executor = self._make_executor() spec = executor.load_spec(spec_text) executor._build_dag(spec) - with self.assertRaises(ValueError) as context: - executor._validate_for_standalone(spec) - self.assertIn('dataset', str(context.exception).lower()) + executor._validate_for_standalone(spec) def test_url_output_rejected(self): """A spec with URL outputs is rejected as unsupported in standalone mode.""" @@ -906,7 +912,7 @@ class TestValidateForStandaloneRemainingBranches(unittest.TestCase): credentials: my-secret: NGC_API_KEY '''), - 'expected_substring': 'credentials', + 'expected_substring': 'credential', }, 'checkpoint': { 'yaml': textwrap.dedent('''\ @@ -1350,8 +1356,8 @@ def test_valid_osmo_tokens_not_flagged(self, mock_run): spec = executor.load_spec(spec_text) executor.execute(spec) - def test_error_message_suggests_dry_run(self): - """The unresolved token error message suggests using --dry-run to expand templates.""" + def test_error_message_suggests_set(self): + """The unresolved token error message suggests using --set to provide values.""" spec_text = textwrap.dedent('''\ workflow: name: helpful @@ -1364,7 +1370,7 @@ def test_error_message_suggests_dry_run(self): spec = executor.load_spec(spec_text) with self.assertRaises(ValueError) as context: executor.execute(spec) - self.assertIn('dry-run', str(context.exception)) + self.assertIn('--set', str(context.exception)) class TestShmSize(unittest.TestCase): @@ -1482,36 +1488,26 @@ def test_non_gpu_task_gets_explicit_shm_size(self, mock_run): class TestJinjaTemplateDetection(unittest.TestCase): - """Verify that specs containing Jinja template markers are rejected before 
execution.""" - - def _write_temp_spec(self, content: str) -> str: - """Write YAML content to a temporary file and return its path.""" - f = tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) - f.write(content) - f.flush() - f.close() - return f.name + """Verify that specs containing Jinja template markers are expanded locally before execution.""" - def test_jinja_block_detected(self): - """A spec containing {% %} Jinja block tags is rejected.""" - path = self._write_temp_spec(textwrap.dedent('''\ + def test_jinja_block_expanded(self): + """A spec containing {% %} Jinja block tags is detected and expanded locally.""" + spec_text = textwrap.dedent('''\ workflow: name: {% if true %}test{% endif %} tasks: - name: task image: alpine:3.18 command: ["echo"] - ''')) - try: - with self.assertRaises(ValueError) as context: - run_workflow_standalone(path) - self.assertIn('Jinja', str(context.exception)) - finally: - os.unlink(path) + ''') + self.assertTrue(_spec_has_templates(spec_text)) + expanded = _expand_jinja_locally(spec_text) + self.assertNotIn('{%', expanded) + self.assertIn('name: test', expanded) - def test_jinja_comment_detected(self): - """A spec containing {# #} Jinja comment tags is rejected.""" - path = self._write_temp_spec(textwrap.dedent('''\ + def test_jinja_comment_expanded(self): + """A spec containing {# #} Jinja comment tags is detected and stripped locally.""" + spec_text = textwrap.dedent('''\ {# A comment #} workflow: name: test @@ -1519,17 +1515,15 @@ def test_jinja_comment_detected(self): - name: task image: alpine:3.18 command: ["echo"] - ''')) - try: - with self.assertRaises(ValueError) as context: - run_workflow_standalone(path) - self.assertIn('Jinja', str(context.exception)) - finally: - os.unlink(path) + ''') + self.assertTrue(_spec_has_templates(spec_text)) + expanded = _expand_jinja_locally(spec_text) + self.assertNotIn('{#', expanded) + self.assertIn('name: test', expanded) - def 
test_default_values_section_detected(self): - """A spec containing a 'default-values' section is rejected as a Jinja template.""" - path = self._write_temp_spec(textwrap.dedent('''\ + def test_default_values_section_expanded(self): + """A spec containing a 'default-values' section has its variables expanded locally.""" + spec_text = textwrap.dedent('''\ workflow: name: "{{experiment_name}}" tasks: @@ -1538,13 +1532,11 @@ def test_default_values_section_detected(self): command: ["echo"] default-values: experiment_name: my-experiment - ''')) - try: - with self.assertRaises(ValueError) as context: - run_workflow_standalone(path) - self.assertIn('Jinja', str(context.exception)) - finally: - os.unlink(path) + ''') + self.assertTrue(_spec_has_templates(spec_text)) + expanded = _expand_jinja_locally(spec_text) + self.assertNotIn('{{', expanded) + self.assertIn('my-experiment', expanded) # ============================================================================ @@ -1620,24 +1612,18 @@ def test_unsupported_spec_data_upload(self): self._run_cookbook_spec('data_upload.yaml') self.assertIn('object storage', str(context.exception).lower()) - def test_unsupported_spec_dataset_upload(self): - """dataset_upload.yaml uses dataset outputs — verify it is cleanly rejected.""" - with self.assertRaises(ValueError) as context: - self._run_cookbook_spec('dataset_upload.yaml') - self.assertIn('dataset', str(context.exception).lower()) - - def test_unsupported_spec_template(self): - """template_hello_world.yaml uses default-values templating — verify it is rejected.""" + def test_template_spec_expanded_locally(self): + """template_hello_world.yaml uses default-values templating — verify it expands locally.""" spec_path = os.path.join(self.COOKBOOK_DIR, 'template_hello_world.yaml') self.assertTrue(os.path.exists(spec_path), f'Cookbook file not found: {spec_path}') - with self.assertRaises(ValueError) as context: - run_workflow_standalone( - spec_path=spec_path, - work_dir=self.work_dir, 
- keep_work_dir=True, - ) - self.assertIn('Jinja', str(context.exception)) + with open(spec_path, encoding='utf-8') as f: + spec_text = f.read() + self.assertTrue(_spec_has_templates(spec_text)) + expanded = _expand_jinja_locally(spec_text) + self.assertNotIn('{{', expanded) + self.assertIn('hello-osmo', expanded) + self.assertIn('Hello from OSMO!', expanded) class TestRunWorkflowStandaloneErrors(unittest.TestCase): From 3cce831b863a00c50e4f88247b3ebed031d66b45 Mon Sep 17 00:00:00 2001 From: Mauricio Franco Date: Thu, 4 Jun 2026 11:59:08 -0700 Subject: [PATCH 35/35] Refactor test setup in TestComposeValidation to use temporary directories for credential handling - Updated the test case to create a temporary directory for storing credentials, improving isolation and preventing side effects during tests. - Enhanced readability by removing hardcoded paths and utilizing context management for temporary directory creation. --- src/utils/tests/test_compose_executor.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/utils/tests/test_compose_executor.py b/src/utils/tests/test_compose_executor.py index a2276b011..f70ea8129 100644 --- a/src/utils/tests/test_compose_executor.py +++ b/src/utils/tests/test_compose_executor.py @@ -473,11 +473,14 @@ def test_provided_credential_passes(self): credentials: my-secret: NGC_API_KEY ''') - executor = ComposeExecutor( - work_dir='/tmp/unused', credentials={'my-secret': '/tmp/secret-dir'}) - spec = executor.load_spec(spec_text) - executor._build_dag(spec) - executor._validate_for_compose(spec) + with tempfile.TemporaryDirectory() as tmp_dir: + secret_dir = os.path.join(tmp_dir, 'secret-dir') + os.makedirs(secret_dir) + executor = ComposeExecutor( + work_dir=tmp_dir, credentials={'my-secret': secret_dir}) + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + executor._validate_for_compose(spec) def test_simple_spec_passes(self): """A simple spec with only task-to-task inputs passes 
compose validation."""