diff --git a/.gitignore b/.gitignore index 2fe57a694..f5180c6e9 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,7 @@ docs/**/domain_config.js .ruff_cache .lycheecache + +.venv/ +build/ +*.egg-info \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md index 45b388253..67e102378 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -120,6 +120,8 @@ Entry point: `service/core/service.py`. Framework: FastAPI + Uvicorn + OpenTelem | `utils/job/` | `Task`, `FrontendJob`, `K8sObjectFactory`, `PodGroupTopologyBuilder` | Workflow execution framework. Task → K8s spec generation. Gang scheduling via PodGroup. Topology constraints. Backend job definitions. | | `utils/connectors/` | `ClusterConnector`, `PostgresConnector`, `RedisConnector` | K8s API wrapper, PostgreSQL operations, Redis job queue management. | | `utils/secret_manager/` | `SecretManager` | JWE-based secret encryption/decryption. MEK/UEK key management. | +| `utils/standalone_executor.py` | `StandaloneExecutor`, `run_workflow_standalone` | Standalone Docker-based workflow execution. Runs workflow specs without Kubernetes by mapping tasks to `docker run` commands with volume mounts for data flow. Supports DAG scheduling, resume (`--from-step`), and GPU passthrough. | +| `utils/compose_executor.py` | `ComposeExecutor`, `run_workflow_compose` | Docker Compose-based parallel workflow execution. Extends StandaloneExecutor with wave-parallel scheduling, `{{host:taskname}}` DNS resolution via shared Compose networks, and GPU support via deploy resource reservations. | | `utils/progress_check/` | — | Liveness/progress tracking for long-running services. | | `utils/metrics/` | — | Prometheus metrics collection and export. | @@ -139,6 +141,8 @@ Entry point: `cli.py` → `main_parser.py` (argparse). 
Subcommand modules: | `login.py` | Authentication | | `pool.py`, `resources.py`, `user.py`, `credential.py`, `access_token.py`, `bucket.py`, `task.py`, `version.py` | Supporting commands | | `backend.py` | Backend cluster management | +| `standalone.py` | Standalone workflow execution via Docker (`osmo standalone run`) | +| `docker_compose.py` | Parallel workflow execution via Docker Compose (`osmo docker-compose run`) | Features: Tab completion (shtab), response formatting (`formatters.py`), spec editor (`editor.py`), PyInstaller packaging (`cli_builder.py`, `packaging/`). diff --git a/cookbook/tutorials/BUILD b/cookbook/tutorials/BUILD new file mode 100644 index 000000000..d56c526f4 --- /dev/null +++ b/cookbook/tutorials/BUILD @@ -0,0 +1,5 @@ +filegroup( + name = "tutorial_specs", + srcs = glob(["*.yaml"]), + visibility = ["//src/utils/tests:__pkg__"], +) diff --git a/src/cli/BUILD b/src/cli/BUILD index 7a9b905ee..9789a9923 100755 --- a/src/cli/BUILD +++ b/src/cli/BUILD @@ -37,6 +37,8 @@ osmo_py_library( "dataset.py", "editor.py", "formatters.py", + "docker_compose.py", + "standalone.py", "login.py", "main_parser.py", "pool.py", @@ -73,6 +75,8 @@ osmo_py_library( "//src/lib/utils:validation", "//src/lib/utils:version", "//src/lib/utils:workflow", + "//src/utils:compose_executor", + "//src/utils:standalone_executor", ], ) diff --git a/src/cli/docker_compose.py b/src/cli/docker_compose.py new file mode 100644 index 000000000..cde0c173f --- /dev/null +++ b/src/cli/docker_compose.py @@ -0,0 +1,127 @@ +""" +SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # pylint: disable=line-too-long + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +SPDX-License-Identifier: Apache-2.0 +""" + +import argparse +import os +import sys + +import shtab + +from src.utils import compose_executor + + +def setup_parser(parser: argparse._SubParsersAction): + """Register the 'docker-compose' subcommand for parallel workflow execution.""" + dc_parser = parser.add_parser( + 'docker-compose', + help='Run workflows using Docker Compose for parallel execution ' + '(no Kubernetes cluster required).') + subparsers = dc_parser.add_subparsers(dest='command') + subparsers.required = True + + run_parser = subparsers.add_parser( + 'run', + help='Execute a workflow spec using Docker Compose for parallel task execution.') + run_parser.add_argument( + '-f', '--file', + required=True, + dest='workflow_file', + help='Path to the workflow YAML spec file.').complete = shtab.FILE + run_parser.add_argument( + '--work-dir', + dest='work_dir', + default=None, + help='Directory for task inputs/outputs and the generated docker-compose.yml. ' + 'Defaults to a temporary directory.') + run_parser.add_argument( + '--keep', + action='store_true', + default=False, + help='Keep the work directory after execution (always kept on failure).') + run_parser.add_argument( + '--compose-cmd', + dest='compose_cmd', + default='docker compose', + help='Docker Compose command to use (e.g. "docker-compose" for V1). ' + 'Default: "docker compose".') + run_parser.add_argument( + '--credential', + nargs='+', + default=[], + help='Map credential names to local directories. ' + 'Format: "=". 
The directory is bind-mounted read-only ' + 'into the container at the path declared in the spec. ' + 'Example: --credential hf-token=$HOME/.hf') + run_parser.add_argument( + '--set', + nargs='+', + default=[], + help='Override default-values in the workflow spec. ' + 'Format: "=". Values are cast as int or float if ' + 'applicable, otherwise kept as strings.') + run_parser.add_argument( + '--set-string', + dest='set_string', + nargs='+', + default=[], + help='Override default-values in the workflow spec, forcing string type. ' + 'Format: "=".') + run_parser.add_argument( + '--shm-size', + dest='shm_size', + default=None, + help='Shared memory size for GPU containers (e.g. 16g, 32g). ' + 'Defaults to 16g for tasks that request GPUs.') + run_parser.set_defaults(func=_run_compose) + + +def _parse_credentials(raw_credentials: list[str]) -> dict[str, str]: + """Parse --credential name=path arguments into a dict.""" + result: dict[str, str] = {} + for entry in raw_credentials: + if '=' not in entry: + raise ValueError( + f'--credential value "{entry}" is incorrectly formatted (expected name=/path)') + name, path = entry.split('=', 1) + if not os.path.isdir(path): + raise ValueError( + f'Credential path for "{name}" does not exist or is not a directory: {path}') + result[name] = path + return result + + +def _run_compose(service_client, args: argparse.Namespace): + """Execute a workflow via Docker Compose using the parsed CLI arguments.""" + try: + credentials = _parse_credentials(args.credential) + success = compose_executor.run_workflow_compose( + spec_path=args.workflow_file, + work_dir=args.work_dir, + keep_work_dir=args.keep, + compose_cmd=args.compose_cmd, + shm_size=args.shm_size, + set_variables=args.set, + set_string_variables=args.set_string, + credentials=credentials, + ) + except (ValueError, FileNotFoundError, PermissionError) as error: + print(f'Error: {error}', file=sys.stderr) + sys.exit(1) + + if not success: + sys.exit(1) diff --git 
a/src/cli/main_parser.py b/src/cli/main_parser.py index 79484ee16..654673923 100644 --- a/src/cli/main_parser.py +++ b/src/cli/main_parser.py @@ -28,6 +28,8 @@ credential, data, dataset, + docker_compose, + standalone, login, pool, profile, @@ -55,7 +57,9 @@ profile.setup_parser, pool.setup_parser, user.setup_parser, - config.setup_parser + config.setup_parser, + standalone.setup_parser, + docker_compose.setup_parser, ) diff --git a/src/cli/standalone.py b/src/cli/standalone.py new file mode 100644 index 000000000..8c3c200f7 --- /dev/null +++ b/src/cli/standalone.py @@ -0,0 +1,140 @@ +""" +SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # pylint: disable=line-too-long + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +SPDX-License-Identifier: Apache-2.0 +""" + +import argparse +import os +import sys + +import shtab + +from src.utils import standalone_executor + + +def setup_parser(parser: argparse._SubParsersAction): + """Register the 'standalone' subcommand and its nested 'run' action with the CLI argument parser.""" + standalone_parser = parser.add_parser( + 'standalone', + help='Run workflows in standalone mode using Docker containers (no Kubernetes cluster required).') + subparsers = standalone_parser.add_subparsers(dest='command') + subparsers.required = True + + run_parser = subparsers.add_parser( + 'run', + help='Execute a workflow spec in standalone mode using Docker containers.') + run_parser.add_argument( + '-f', '--file', + required=True, + dest='workflow_file', + help='Path to the workflow YAML spec file.').complete = shtab.FILE + run_parser.add_argument( + '--work-dir', + dest='work_dir', + default=None, + help='Directory for task inputs/outputs. Defaults to a temporary directory.') + run_parser.add_argument( + '--keep', + action='store_true', + default=False, + help='Keep the work directory after execution (always kept on failure).') + run_parser.add_argument( + '--docker', + dest='docker_cmd', + default='docker', + help='Docker-compatible command to use (e.g. podman). Default: docker.') + run_parser.add_argument( + '--resume', + action='store_true', + default=False, + help='Resume a previous run, skipping tasks that already completed successfully. ' + 'Requires --work-dir pointing to the previous run directory.') + run_parser.add_argument( + '--from-step', + dest='from_step', + default=None, + help='Resume from a specific task, re-running it and all downstream tasks. ' + 'Tasks upstream of the specified step are skipped if they completed ' + 'successfully. Requires --work-dir pointing to the previous run directory.') + run_parser.add_argument( + '--credential', + nargs='+', + default=[], + help='Map credential names to local directories. ' + 'Format: "=". 
The directory is bind-mounted read-only ' + 'into the container at the path declared in the spec. ' + 'Example: --credential hf-token=$HOME/.hf') + run_parser.add_argument( + '--set', + nargs='+', + default=[], + help='Override default-values in the workflow spec. ' + 'Format: "=". Values are cast as int or float if ' + 'applicable, otherwise kept as strings.') + run_parser.add_argument( + '--set-string', + dest='set_string', + nargs='+', + default=[], + help='Override default-values in the workflow spec, forcing string type. ' + 'Format: "=".') + run_parser.add_argument( + '--shm-size', + dest='shm_size', + default=None, + help='Shared memory size for GPU containers (e.g. 16g, 32g). ' + 'Defaults to 16g for tasks that request GPUs. ' + 'PyTorch DataLoader workers require large shared memory.') + run_parser.set_defaults(func=_run_standalone) + + +def _parse_credentials(raw_credentials: list[str]) -> dict[str, str]: + """Parse --credential name=path arguments into a dict.""" + result: dict[str, str] = {} + for entry in raw_credentials: + if '=' not in entry: + raise ValueError( + f'--credential value "{entry}" is incorrectly formatted (expected name=/path)') + name, path = entry.split('=', 1) + if not os.path.isdir(path): + raise ValueError( + f'Credential path for "{name}" does not exist or is not a directory: {path}') + result[name] = path + return result + + +def _run_standalone(service_client, args: argparse.Namespace): + """Execute a workflow in standalone mode via Docker using the parsed CLI arguments.""" + try: + credentials = _parse_credentials(args.credential) + success = standalone_executor.run_workflow_standalone( + spec_path=args.workflow_file, + work_dir=args.work_dir, + keep_work_dir=args.keep, + resume=args.resume, + from_step=args.from_step, + docker_cmd=args.docker_cmd, + shm_size=args.shm_size, + set_variables=args.set, + set_string_variables=args.set_string, + credentials=credentials, + ) + except (ValueError, FileNotFoundError, PermissionError) 
as error: + print(f'Error: {error}', file=sys.stderr) + sys.exit(1) + + if not success: + sys.exit(1) diff --git a/src/utils/BUILD b/src/utils/BUILD index 77f45aafb..a6674a2f3 100644 --- a/src/utils/BUILD +++ b/src/utils/BUILD @@ -126,3 +126,26 @@ osmo_py_library( ], visibility = ["//visibility:public"], ) + +osmo_py_library( + name = "standalone_executor", + srcs = ["standalone_executor.py"], + deps = [ + requirement("jinja2"), + requirement("pyyaml"), + "//src/lib/utils:workflow", + "//src/utils/job", + ], + visibility = ["//visibility:public"], +) + +osmo_py_library( + name = "compose_executor", + srcs = ["compose_executor.py"], + deps = [ + requirement("pyyaml"), + "//src/utils:standalone_executor", + "//src/utils/job", + ], + visibility = ["//visibility:public"], +) diff --git a/src/utils/compose_executor.py b/src/utils/compose_executor.py new file mode 100644 index 000000000..97f490f48 --- /dev/null +++ b/src/utils/compose_executor.py @@ -0,0 +1,564 @@ +""" +SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # pylint: disable=line-too-long + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +SPDX-License-Identifier: Apache-2.0 +""" + +import json +import logging +import os +import re +import shlex +import shutil +import subprocess +import tempfile +from typing import Dict, List + +import yaml + +from src.utils.job import task as task_module +from src.utils.job import workflow as workflow_module +from src.utils.standalone_executor import ( + CONTAINER_DATA_PATH, + StandaloneExecutor, + TaskNode, + TaskResult, + _expand_jinja_locally, + _spec_has_templates, +) + + +logger = logging.getLogger(__name__) + +COMPOSE_FILE_NAME = 'docker-compose.yml' + + +class ComposeExecutor(StandaloneExecutor): + """ + Executes an OSMO workflow spec using Docker Compose for parallel task execution. + + Extends StandaloneExecutor with: + - True parallel execution of independent tasks within each scheduling wave + - {{host:taskname}} token support via Docker Compose DNS + - Shared network per task group for gang-scheduled communication + - GPU passthrough via compose deploy.resources.reservations + + Execution model: + Generates a single docker-compose.yml with all services defined up-front, + then executes them in waves. Each wave contains all tasks whose upstream + dependencies are satisfied. Tasks within a wave run in parallel via + ``docker compose up``. Group co-scheduling is enforced so that all members + of a multi-task group start together in the same wave. 
+ """ + + def __init__(self, work_dir: str, keep_work_dir: bool = False, + compose_cmd: str = 'docker compose', shm_size: str | None = None, + credentials: Dict[str, str] | None = None): + super().__init__(work_dir=work_dir, keep_work_dir=keep_work_dir, + docker_cmd='docker', shm_size=shm_size, + credentials=credentials) + self._compose_cmd = compose_cmd + + @property + def _compose_file_path(self) -> str: + return os.path.join(self._work_dir, COMPOSE_FILE_NAME) + + def _compose_project_name(self, spec: workflow_module.WorkflowSpec) -> str: + sanitized = re.sub(r'[^a-z0-9-]', '-', spec.name.lower()) + sanitized = re.sub(r'-{2,}', '-', sanitized).strip('-') + return f'osmo-{sanitized}' if sanitized else 'osmo-default' + + def _compose_base_cmd(self, spec: workflow_module.WorkflowSpec) -> List[str]: + return ( + shlex.split(self._compose_cmd) + + ['-p', self._compose_project_name(spec), '-f', self._compose_file_path] + ) + + # ------------------------------------------------------------------ + # Execution + # ------------------------------------------------------------------ + + def execute(self, spec: workflow_module.WorkflowSpec, + resume: bool = False, from_step: str | None = None) -> bool: + """Run all tasks in wave-parallel order via Docker Compose.""" + if resume or from_step: + raise NotImplementedError( + 'docker-compose mode does not support --resume or --from-step yet. 
' + 'Use standalone mode for resume functionality.') + self._results.clear() + self._build_dag(spec) + self._validate_for_compose(spec) + self._setup_directories() + self._write_inline_files(spec) + self._generate_compose_file(spec) + + total_tasks = sum(len(g.tasks) for g in self._groups(spec)) + logger.info('Workflow "%s": %d task(s) across %d group(s) [docker-compose mode]', + spec.name, total_tasks, len(self._groups(spec))) + + try: + wave_number = 0 + while True: + wave = self._find_ready_wave() + if not wave: + break + + wave_number += 1 + logger.info('=== Wave %d: %s ===', wave_number, ', '.join(wave)) + + wave_results = self._run_wave(wave, spec) + + fatal_failure = False + for task_name, exit_code in wave_results.items(): + output_dir = os.path.join(self._work_dir, task_name, 'output') + self._results[task_name] = TaskResult( + name=task_name, exit_code=exit_code, output_dir=output_dir) + + if exit_code != 0: + if self._is_nonlead_failure_ignorable(task_name): + logger.warning( + 'Non-lead task "%s" failed with exit code %d ' + '(ignored — group "%s" has ignoreNonleadStatus=true)', + task_name, exit_code, self._task_nodes[task_name].group) + else: + logger.error('Task "%s" failed with exit code %d', + task_name, exit_code) + self._cancel_downstream(task_name) + fatal_failure = True + else: + logger.info('Task "%s" completed successfully', task_name) + + if fatal_failure: + return False + + unexecuted = set(self._task_nodes.keys()) - set(self._results.keys()) + if unexecuted: + logger.error( + 'Workflow "%s" stalled — tasks could not be scheduled ' + '(possible cycle or unsatisfiable group): %s', + spec.name, ', '.join(sorted(unexecuted))) + return False + + fatal_failures = [ + name for name, result in self._results.items() + if result.exit_code != 0 + and not self._is_nonlead_failure_ignorable(name) + ] + if fatal_failures: + logger.error('Workflow failed. 
Failed tasks: %s', + ', '.join(fatal_failures)) + return False + + logger.info('Workflow "%s" completed successfully', spec.name) + return True + finally: + self._compose_cleanup(spec) + + # ------------------------------------------------------------------ + # Validation + # ------------------------------------------------------------------ + + def _validate_for_compose(self, spec: workflow_module.WorkflowSpec): + """Reject cluster-only features while allowing {{host:}} tokens.""" + unsupported_features: List[str] = [] + for group in self._groups(spec): + for task_spec in group.tasks: + for input_source in task_spec.inputs: + if isinstance(input_source, task_module.DatasetInputOutput): + unsupported_features.append( + f'Task "{task_spec.name}": dataset inputs require object storage') + elif isinstance(input_source, task_module.URLInputOutput): + unsupported_features.append( + f'Task "{task_spec.name}": URL inputs require network/storage access') + + for output in task_spec.outputs: + if isinstance(output, task_module.URLInputOutput): + unsupported_features.append( + f'Task "{task_spec.name}": URL outputs require object storage') + elif isinstance(output, task_module.DatasetInputOutput): + logger.info( + 'Task "%s": dataset output "%s" ignored in docker-compose mode ' + '— data is available in the work directory', + task_spec.name, output.dataset.name) + + for cred_name, cred_mount in task_spec.credentials.items(): + if isinstance(cred_mount, dict): + unsupported_features.append( + f'Task "{task_spec.name}": credential "{cred_name}" uses ' + f'dict-style mapping which is not supported in docker-compose ' + f'mode; provide credentials as NAME=/path') + elif cred_name not in self._credentials: + unsupported_features.append( + f'Task "{task_spec.name}": credential "{cred_name}" not provided. 
' + f'Use --credential {cred_name}=/path/to/dir') + + if task_spec.checkpoint: + unsupported_features.append( + f'Task "{task_spec.name}": checkpoints require object storage') + + if task_spec.volumeMounts: + unsupported_features.append( + f'Task "{task_spec.name}": volumeMounts require cluster-level host paths') + + if task_spec.privileged: + unsupported_features.append( + f'Task "{task_spec.name}": privileged containers are not ' + f'supported in docker-compose mode') + + if task_spec.hostNetwork: + unsupported_features.append( + f'Task "{task_spec.name}": hostNetwork is not supported ' + f'in docker-compose mode') + + self._validate_host_tokens(task_spec, group) + + if unsupported_features: + raise ValueError( + 'The following features are not supported in docker-compose ' + 'execution mode:\n - ' + + '\n - '.join(unsupported_features)) + + _HOST_TOKEN_NAME_PATTERN = re.compile(r'\{\{\s*host:(\S+)\s*\}\}') + + def _validate_host_tokens(self, task_spec: task_module.TaskSpec, + group: task_module.TaskGroupSpec): + """Ensure {{host:taskname}} tokens only reference tasks in the same group.""" + group_task_names = {t.name for t in group.tasks} + fields_to_check = list(task_spec.command) + list(task_spec.args) + fields_to_check += list(task_spec.environment.values()) + fields_to_check += [file_spec.contents for file_spec in task_spec.files] + + for field in fields_to_check: + for match in self._HOST_TOKEN_NAME_PATTERN.finditer(field): + referenced_task = match.group(1) + if referenced_task not in group_task_names: + raise ValueError( + f'Task "{task_spec.name}": {{{{host:{referenced_task}}}}} ' + f'references a task outside its group "{group.name}". 
' + f'Host tokens can only reference tasks within the same group.') + + # ------------------------------------------------------------------ + # Token map (extended with {{host:taskname}}) + # ------------------------------------------------------------------ + + def _build_token_map(self, node: TaskNode) -> Dict[str, str]: + tokens = super()._build_token_map(node) + group_spec = self._group_specs[node.group] + for task_spec in group_spec.tasks: + tokens[f'host:{task_spec.name}'] = task_spec.name + return tokens + + # ------------------------------------------------------------------ + # Inline files + # ------------------------------------------------------------------ + + def _write_inline_files(self, spec: workflow_module.WorkflowSpec): + """Write all inline file specs to disk with token substitution.""" + for group in self._groups(spec): + for task_spec in group.tasks: + node = self._task_nodes[task_spec.name] + token_map = self._build_token_map(node) + files_dir = os.path.join(self._work_dir, task_spec.name, 'files') + os.makedirs(files_dir, exist_ok=True) + + for file_spec in task_spec.files: + resolved_contents = self._substitute_tokens( + file_spec.contents, token_map) + host_path = os.path.realpath( + os.path.join(files_dir, file_spec.path.lstrip('/'))) + if not host_path.startswith(os.path.realpath(files_dir) + os.sep): + raise ValueError( + f'Task "{task_spec.name}": file path ' + f'"{file_spec.path}" escapes the task directory') + os.makedirs(os.path.dirname(host_path), exist_ok=True) + with open(host_path, 'w', encoding='utf-8') as f: + f.write(resolved_contents) + + # ------------------------------------------------------------------ + # Compose file generation + # ------------------------------------------------------------------ + + def _generate_compose_file(self, spec: workflow_module.WorkflowSpec): + """Write a docker-compose.yml containing every task as a service.""" + compose: Dict = {'services': {}} + networks_needed: set = set() + + for 
task_name, node in self._task_nodes.items(): + service = self._build_compose_service(node, spec) + compose['services'][task_name] = service + networks_needed.add(node.group) + + if networks_needed: + compose['networks'] = { + name: {'driver': 'bridge'} + for name in sorted(networks_needed) + } + + with open(self._compose_file_path, 'w', encoding='utf-8') as f: + yaml.safe_dump(compose, f, default_flow_style=False, sort_keys=False) + + logger.info('Generated compose file: %s', self._compose_file_path) + + @staticmethod + def _escape_compose_interpolation(text: str) -> str: + """Escape ``$`` as ``$$`` so Docker Compose passes them literally to the container.""" + return text.replace('$', '$$') + + def _build_compose_service(self, node: TaskNode, + spec: workflow_module.WorkflowSpec) -> Dict: + """Build a single Docker Compose service definition for a task.""" + task_spec = node.spec + token_map = self._build_token_map(node) + + resolved_command = [ + self._substitute_tokens(c, token_map) for c in task_spec.command] + resolved_args = [ + self._substitute_tokens(a, token_map) for a in task_spec.args] + resolved_environment = { + key: self._substitute_tokens(value, token_map) + for key, value in task_spec.environment.items() + } + + all_resolved = ( + resolved_command + resolved_args + list(resolved_environment.values()) + + [self._substitute_tokens(f.contents, token_map) + for f in task_spec.files] + ) + self._check_unresolved_tokens(node.name, all_resolved) + + esc = self._escape_compose_interpolation + + service: Dict = {'image': task_spec.image} + + if resolved_command: + service['entrypoint'] = [esc(resolved_command[0])] + trailing = resolved_command[1:] + resolved_args + if trailing: + service['command'] = [esc(t) for t in trailing] + elif resolved_args: + service['command'] = [esc(a) for a in resolved_args] + + if resolved_environment: + service['environment'] = { + k: esc(v) for k, v in resolved_environment.items() + } + + volumes: List[str] = [] + task_dir = 
os.path.abspath(os.path.join(self._work_dir, node.name)) + output_dir = os.path.join(task_dir, 'output') + volumes.append(f'{output_dir}:{CONTAINER_DATA_PATH}/output') + + for index, input_source in enumerate(task_spec.inputs): + if isinstance(input_source, task_module.TaskInputOutput): + upstream_output = os.path.abspath( + os.path.join(self._work_dir, input_source.task, 'output')) + volumes.append( + f'{upstream_output}:{CONTAINER_DATA_PATH}/input/{index}:ro') + + files_dir = os.path.join(task_dir, 'files') + for file_spec in task_spec.files: + host_path = os.path.realpath( + os.path.join(files_dir, file_spec.path.lstrip('/'))) + volumes.append(f'{host_path}:{file_spec.path}:ro') + + for cred_name, cred_mount in task_spec.credentials.items(): + if isinstance(cred_mount, dict): + raise ValueError( + f'Task "{node.name}": credential "{cred_name}" uses dict-style ' + f'mapping which is not supported in docker-compose mode; ' + f'provide credentials as NAME=/path') + if isinstance(cred_mount, str) and cred_name in self._credentials: + local_dir = os.path.abspath(self._credentials[cred_name]) + volumes.append(f'{local_dir}:{cred_mount}:ro') + + if volumes: + service['volumes'] = volumes + + service['networks'] = [node.group] + + gpu_count = self._task_gpu_count(task_spec, spec) + if gpu_count > 0: + service['deploy'] = { + 'resources': { + 'reservations': { + 'devices': [{ + 'driver': 'nvidia', + 'count': gpu_count, + 'capabilities': ['gpu'], + }] + } + } + } + service['shm_size'] = self._shm_size or self.DEFAULT_SHM_SIZE + elif self._shm_size: + service['shm_size'] = self._shm_size + + return service + + # ------------------------------------------------------------------ + # Wave scheduling + # ------------------------------------------------------------------ + + def _find_ready_wave(self) -> List[str]: + """ + Return the next batch of tasks to run in parallel. 
+ + All members of a multi-task group are co-scheduled: a group is only + included when every unfinished member has its upstream dependencies + satisfied. If co-scheduling stalls (e.g. cross-group edges inside a + multi-task group), we fall back to plain task-level readiness to avoid + deadlocks. + """ + ready_tasks = self._find_ready_tasks() + if not ready_tasks: + return [] + + ready_set = set(ready_tasks) + + groups_with_ready: Dict[str, List[str]] = {} + for task_name in ready_tasks: + group = self._task_nodes[task_name].group + groups_with_ready.setdefault(group, []).append(task_name) + + wave: List[str] = [] + for group_name, group_ready in groups_with_ready.items(): + group_spec = self._group_specs[group_name] + all_members = {t.name for t in group_spec.tasks} + unfinished = all_members - set(self._results.keys()) + + if unfinished.issubset(ready_set): + wave.extend(sorted(unfinished)) + elif len(all_members) == 1: + wave.extend(group_ready) + + if not wave and ready_tasks: + wave = ready_tasks + + return wave + + # ------------------------------------------------------------------ + # Wave execution + # ------------------------------------------------------------------ + + def _run_wave(self, task_names: List[str], + spec: workflow_module.WorkflowSpec) -> Dict[str, int]: + """Start *task_names* in parallel and block until they all exit.""" + base_cmd = self._compose_base_cmd(spec) + + up_cmd = base_cmd + ['up', '--no-deps', '--no-log-prefix'] + list(task_names) + logger.debug('Compose command: %s', ' '.join(up_cmd)) + + try: + subprocess.run(up_cmd, check=False) + except FileNotFoundError: + logger.error( + 'Docker Compose not found. 
Is "%s" available in your PATH?', + self._compose_cmd) + return {name: 127 for name in task_names} + + results: Dict[str, int] = {} + for task_name in task_names: + results[task_name] = self._get_service_exit_code(task_name, spec) + + rm_cmd = base_cmd + ['rm', '-f'] + list(task_names) + subprocess.run(rm_cmd, capture_output=True, check=False) + + return results + + def _get_service_exit_code(self, service_name: str, + spec: workflow_module.WorkflowSpec) -> int: + """Query Docker Compose for the exit code of *service_name*.""" + ps_cmd = self._compose_base_cmd(spec) + [ + 'ps', '-a', '--format', 'json', service_name, + ] + try: + result = subprocess.run( + ps_cmd, capture_output=True, text=True, timeout=30) + if result.returncode != 0: + logger.warning('Failed to query exit code for "%s": %s', + service_name, result.stderr.strip()) + return 1 + + for line in result.stdout.strip().splitlines(): + line = line.strip() + if not line: + continue + try: + container_info = json.loads(line) + except json.JSONDecodeError: + logger.error('Failed to parse container info line as JSON: %s', line, exc_info=True) + continue + if isinstance(container_info, list): + for entry in container_info: + if entry.get('Service') == service_name: + return entry.get('ExitCode', 1) + elif container_info.get('Service') == service_name: + return container_info.get('ExitCode', 1) + + logger.warning( + 'No container info found for service "%s" in docker compose output:\n%s', + service_name, result.stdout.strip()) + return 1 + except (subprocess.TimeoutExpired, FileNotFoundError): + logger.warning('Could not determine exit code for "%s"', service_name) + return 1 + + # ------------------------------------------------------------------ + # Cleanup + # ------------------------------------------------------------------ + + def _compose_cleanup(self, spec: workflow_module.WorkflowSpec): + """Tear down containers and networks created by Docker Compose.""" + down_cmd = self._compose_base_cmd(spec) + 
['down', '--remove-orphans'] + try: + subprocess.run(down_cmd, capture_output=True, timeout=60, check=False) + except (subprocess.TimeoutExpired, FileNotFoundError): + logger.warning('Failed to clean up Docker Compose resources') + + +def run_workflow_compose(spec_path: str, work_dir: str | None = None, + keep_work_dir: bool = False, + compose_cmd: str = 'docker compose', + shm_size: str | None = None, + set_variables: List[str] | None = None, + set_string_variables: List[str] | None = None, + credentials: Dict[str, str] | None = None) -> bool: + """Load a workflow spec and execute it via Docker Compose.""" + with open(spec_path, encoding='utf-8') as f: + spec_text = f.read() + + if _spec_has_templates(spec_text): + logger.info('Spec contains Jinja templates — expanding locally') + spec_text = _expand_jinja_locally(spec_text, set_variables, set_string_variables) + + created_work_dir = work_dir is None + if work_dir is None: + work_dir = tempfile.mkdtemp(prefix='osmo-compose-') + logger.info('Using temporary work directory: %s', work_dir) + + success = False + try: + executor = ComposeExecutor(work_dir=work_dir, keep_work_dir=keep_work_dir, + compose_cmd=compose_cmd, shm_size=shm_size, + credentials=credentials) + spec = executor.load_spec(spec_text) + success = executor.execute(spec) + finally: + if created_work_dir and not keep_work_dir and success: + shutil.rmtree(work_dir, ignore_errors=True) + elif not success: + logger.info('Work directory preserved for debugging: %s', work_dir) + + return success diff --git a/src/utils/standalone_executor.py b/src/utils/standalone_executor.py new file mode 100644 index 000000000..7aba35fbf --- /dev/null +++ b/src/utils/standalone_executor.py @@ -0,0 +1,778 @@ +""" +SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# pylint: disable=line-too-long + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +SPDX-License-Identifier: Apache-2.0 +""" + +import base64 as base64_module +import dataclasses +import hashlib +import json +import logging +import os +import re +import shutil +import subprocess +import tempfile +import time +from collections import deque +from typing import Any, Dict, List, Set + +import jinja2 +import jinja2.sandbox +import yaml + +from src.lib.utils import workflow as workflow_utils +from src.utils.job import task as task_module +from src.utils.job import workflow as workflow_module + + +logger = logging.getLogger(__name__) + +STATE_FILE_NAME = '.osmo-state.json' +CONTAINER_DATA_PATH = '/osmo/data' + + +@dataclasses.dataclass +class TaskNode: + """A node in the workflow DAG, linking a task spec to its upstream and downstream dependencies.""" + + name: str + spec: task_module.TaskSpec + group: str + upstream: Set[str] = dataclasses.field(default_factory=set) + downstream: Set[str] = dataclasses.field(default_factory=set) + + +@dataclasses.dataclass +class TaskResult: + """Outcome of a single task execution, capturing its exit code and output directory path.""" + + name: str + exit_code: int + output_dir: str + + +class StandaloneExecutor: + """ + Executes an OSMO workflow spec in standalone mode using Docker, without Kubernetes. 
+ + Supports: + - Serial and parallel task DAGs + - Task groups with lead-task failure policy (ignoreNonleadStatus) + - {{output}} and {{input:N}} / {{input:taskname}} token substitution + - Inline `files:` written to the container + - `environment:` passed as Docker env vars + - Task-to-task data flow via shared local directories + - GPU passthrough via --gpus for tasks that declare gpu > 0 in resources + - Credentials via --credential NAME=/path (mounted read-only in _run_task) + - Jinja-templated specs (expanded locally via _expand_jinja_locally) + + Does NOT support (raises clear errors): + - Dataset / URL inputs/outputs (require object storage) + - Checkpoints, volumeMounts (require cluster infra) + - {{host:taskname}} tokens (require parallel containers with shared networking) + """ + + DEFAULT_SHM_SIZE = '16g' + + def __init__(self, work_dir: str, keep_work_dir: bool = False, docker_cmd: str = 'docker', + shm_size: str | None = None, + credentials: Dict[str, str] | None = None): + """Initialize the executor with a work directory, cleanup preference, and container runtime command.""" + self._work_dir = work_dir + self._keep_work_dir = keep_work_dir + self._docker_cmd = docker_cmd + self._shm_size = shm_size + self._credentials = credentials or {} + self._task_nodes: Dict[str, TaskNode] = {} + self._group_specs: Dict[str, task_module.TaskGroupSpec] = {} + self._results: Dict[str, TaskResult] = {} + self._available_gpus: int | None = None + self._workflow_fingerprint: str = '' + + def _detect_available_gpus(self) -> int: + """Query nvidia-smi to count available GPUs, caching the result for subsequent calls.""" + if self._available_gpus is not None: + return self._available_gpus + try: + result = subprocess.run( + ['nvidia-smi', '--query-gpu=index', '--format=csv,noheader'], + capture_output=True, text=True, timeout=10, + ) + if result.returncode == 0: + gpu_indices = [line.strip() for line in result.stdout.strip().splitlines() if line.strip()] + 
self._available_gpus = len(gpu_indices) + else: + logger.warning('nvidia-smi failed (exit %d) — assuming 0 GPUs available', result.returncode) + self._available_gpus = 0 + except FileNotFoundError: + logger.warning('nvidia-smi not found — assuming 0 GPUs available') + self._available_gpus = 0 + except subprocess.TimeoutExpired: + logger.warning('nvidia-smi timed out — assuming 0 GPUs available') + self._available_gpus = 0 + return self._available_gpus + + def _compute_workflow_fingerprint(self, spec: workflow_module.WorkflowSpec) -> str: + """Compute a deterministic SHA-256 hash from task specs to detect workflow changes across runs.""" + fingerprint_data: List[Dict[str, Any]] = [] + for name in sorted(self._task_nodes): + task_spec = self._task_nodes[name].spec + files_data = [ + {'path': f.path, 'contents': f.contents, 'base64': f.base64} + for f in sorted(task_spec.files, key=lambda f: f.path) + ] if task_spec.files else [] + named_resource = spec.resources.get(task_spec.resource) + named_resource_dict = named_resource.model_dump(exclude_none=True) if named_resource else {} + inline_resource_dict = task_spec.resources.model_dump(exclude_defaults=True) + effective_resource = {**named_resource_dict, **inline_resource_dict} + credentials_data = { + cred_name: { + 'mount_path': cred_mount, + 'source': os.path.abspath(self._credentials[cred_name]) + if cred_name in self._credentials else None, + } + for cred_name, cred_mount in sorted(task_spec.credentials.items()) + } if task_spec.credentials else {} + fingerprint_data.append({ + 'name': name, + 'image': task_spec.image, + 'command': task_spec.command, + 'args': task_spec.args, + 'environment': dict(sorted(task_spec.environment.items())), + 'inputs': [str(i) for i in task_spec.inputs], + 'resource': task_spec.resource, + 'resource_config': effective_resource, + 'files': files_data, + 'credentials': credentials_data, + }) + blob = json.dumps(fingerprint_data, sort_keys=True, separators=(',', ':')) + return 
hashlib.sha256(blob.encode('utf-8')).hexdigest() + + def load_spec(self, spec_text: str) -> workflow_module.WorkflowSpec: + """Parse raw YAML text into a validated WorkflowSpec via the versioned spec model.""" + raw = yaml.safe_load(spec_text) + if not isinstance(raw, dict): + raise ValueError( + f'Expected a YAML mapping for the workflow spec, ' + f'got {type(raw).__name__}') + versioned = workflow_module.VersionedWorkflowSpec(**raw) + return versioned.workflow + + def execute(self, spec: workflow_module.WorkflowSpec, + resume: bool = False, from_step: str | None = None) -> bool: + """Run all tasks in topological order, returning True if the entire workflow succeeds.""" + self._results.clear() + self._build_dag(spec) + self._workflow_fingerprint = self._compute_workflow_fingerprint(spec) + self._validate_for_standalone(spec) + + if resume or from_step: + self._restore_completed_tasks(from_step) + + self._setup_directories() + self._clean_rerun_output_dirs() + + total_tasks = sum(len(g.tasks) for g in self._groups(spec)) + skipped = len(self._results) + remaining = total_tasks - skipped + if skipped > 0: + logger.info('Workflow "%s": resuming — %d task(s) skipped, %d remaining', + spec.name, skipped, remaining) + else: + logger.info('Workflow "%s": %d task(s) across %d group(s)', + spec.name, total_tasks, len(self._groups(spec))) + + ready = self._find_ready_tasks() + while ready: + for task_name in ready: + node = self._task_nodes[task_name] + logger.info('--- Running task: %s (image: %s) ---', task_name, node.spec.image) + result = self._run_task(node, spec) + self._results[task_name] = result + self._save_state() + + if result.exit_code != 0: + if self._is_nonlead_failure_ignorable(task_name): + logger.warning( + 'Non-lead task "%s" failed with exit code %d ' + '(ignored — group "%s" has ignoreNonleadStatus=true)', + task_name, result.exit_code, node.group) + else: + logger.error('Task "%s" failed with exit code %d', task_name, result.exit_code) + 
self._cancel_downstream(task_name) + return False + else: + logger.info('Task "%s" completed successfully', task_name) + + ready = self._find_ready_tasks() + + unexecuted = set(self._task_nodes.keys()) - set(self._results.keys()) + if unexecuted: + logger.error('Workflow "%s" stalled — tasks could not be scheduled (possible cycle): %s', + spec.name, ', '.join(sorted(unexecuted))) + return False + + fatal_failures = [ + name for name, r in self._results.items() + if r.exit_code != 0 and not self._is_nonlead_failure_ignorable(name) + ] + if fatal_failures: + logger.error('Workflow failed. Failed tasks: %s', ', '.join(fatal_failures)) + return False + + logger.info('Workflow "%s" completed successfully', spec.name) + return True + + @property + def _state_file_path(self) -> str: + """Absolute path to the JSON state file used for resume tracking.""" + return os.path.join(self._work_dir, STATE_FILE_NAME) + + def _save_state(self): + """Persist current task results to the state file so runs can be resumed later.""" + state: Dict[str, Any] = { + 'workflow_fingerprint': self._workflow_fingerprint, + 'tasks': { + name: {'exit_code': result.exit_code, 'output_dir': result.output_dir} + for name, result in self._results.items() + if result.exit_code != -1 + }, + } + tmp_path = self._state_file_path + '.tmp' + with open(tmp_path, 'w', encoding='utf-8') as f: + json.dump(state, f, indent=2) + f.flush() + os.fsync(f.fileno()) + os.replace(tmp_path, self._state_file_path) + state_dir = os.path.dirname(self._state_file_path) or '.' 
+ dir_fd = os.open(state_dir, os.O_RDONLY) + try: + os.fsync(dir_fd) + finally: + os.close(dir_fd) + + def _load_state(self) -> Dict | None: + """Load previously saved task state from disk, returning None if no state file exists or if the file is corrupt.""" + if not os.path.exists(self._state_file_path): + return None + try: + with open(self._state_file_path, encoding='utf-8') as f: + return json.load(f) + except (json.JSONDecodeError, IOError) as exc: + corrupt_path = f'{self._state_file_path}.corrupt.{int(time.time())}' + try: + os.rename(self._state_file_path, corrupt_path) + logger.warning( + 'State file is corrupt (%s); renamed to %s and starting fresh', + exc, corrupt_path) + except OSError: + logger.warning( + 'State file is corrupt (%s); starting fresh', exc) + return None + + def _restore_completed_tasks(self, from_step: str | None = None): + """Reload completed tasks from a previous run, optionally invalidating from a given step onward.""" + state = self._load_state() + if state is None: + logger.info('No previous state found — starting from scratch') + return + + saved_fingerprint = state.get('workflow_fingerprint') + if not saved_fingerprint: + logger.warning( + 'State file has no workflow fingerprint — cannot verify ' + 'that the spec matches the previous run; reused outputs may be stale') + elif saved_fingerprint != self._workflow_fingerprint: + logger.warning( + 'Workflow spec has changed since the previous run ' + '(fingerprint %s → %s); reused outputs may be stale', + saved_fingerprint[:12], self._workflow_fingerprint[:12]) + + completed: Dict[str, Dict] = {} + for name, info in state.get('tasks', {}).items(): + if name not in self._task_nodes: + continue + if info['exit_code'] == 0 and os.path.isdir(info['output_dir']): + completed[name] = info + + if from_step: + if from_step not in self._task_nodes: + raise ValueError(f'Task "{from_step}" not found in workflow') + to_invalidate = self._get_downstream_tasks(from_step) + 
to_invalidate.add(from_step) + for name in to_invalidate: + completed.pop(name, None) + + for name, info in completed.items(): + self._results[name] = TaskResult( + name=name, exit_code=0, output_dir=info['output_dir']) + logger.info('Resuming: skipping completed task "%s"', name) + + def _get_downstream_tasks(self, task_name: str) -> Set[str]: + """Return all transitive downstream dependents of the given task via BFS.""" + visited: Set[str] = set() + queue: deque[str] = deque([task_name]) + while queue: + current = queue.popleft() + for downstream in self._task_nodes[current].downstream: + if downstream not in visited: + visited.add(downstream) + queue.append(downstream) + return visited + + def _clean_rerun_output_dirs(self): + """Remove output directories for tasks that will be re-executed so no stale artifacts remain.""" + tasks_to_rerun = set(self._task_nodes.keys()) - set(self._results.keys()) + for task_name in tasks_to_rerun: + output_dir = os.path.join(self._work_dir, task_name, 'output') + if os.path.isdir(output_dir): + shutil.rmtree(output_dir) + os.makedirs(output_dir, exist_ok=True) + logger.debug('Cleaned output directory for task "%s"', task_name) + + def _groups(self, spec: workflow_module.WorkflowSpec) -> List[task_module.TaskGroupSpec]: + """Return the spec's groups, or synthesize one group per task when groups are absent.""" + if spec.groups: + return spec.groups + return [task_module.TaskGroupSpec(name=t.name, tasks=[t]) for t in spec.tasks] + + def _build_dag(self, spec: workflow_module.WorkflowSpec): + """Construct the internal DAG of TaskNodes from the workflow spec's tasks and input dependencies.""" + self._task_nodes.clear() + self._group_specs.clear() + + for group in self._groups(spec): + self._group_specs[group.name] = group + for task_spec in group.tasks: + self._task_nodes[task_spec.name] = TaskNode( + name=task_spec.name, + spec=task_spec, + group=group.name, + ) + + for group in self._groups(spec): + for task_spec in group.tasks: + 
for input_source in task_spec.inputs: + if isinstance(input_source, task_module.TaskInputOutput): + upstream_task = input_source.task + if upstream_task not in self._task_nodes: + raise ValueError( + f'Task "{task_spec.name}" depends on unknown task "{upstream_task}"') + self._task_nodes[task_spec.name].upstream.add(upstream_task) + self._task_nodes[upstream_task].downstream.add(task_spec.name) + + self._check_for_cycles() + + def _check_for_cycles(self): + """Raise ValueError if the task DAG contains any cycles, reporting the cycle path.""" + UNVISITED, IN_PROGRESS, DONE = 0, 1, 2 + state: Dict[str, int] = {name: UNVISITED for name in self._task_nodes} + path: List[str] = [] + + def visit(name: str) -> List[str] | None: + if state[name] == DONE: + return None + if state[name] == IN_PROGRESS: + cycle_start = path.index(name) + return [*path[cycle_start:], name] + + state[name] = IN_PROGRESS + path.append(name) + for downstream in self._task_nodes[name].downstream: + cycle = visit(downstream) + if cycle is not None: + return cycle + path.pop() + state[name] = DONE + return None + + for name in self._task_nodes: + cycle = visit(name) + if cycle is not None: + raise ValueError( + f'Circular dependency detected: {" -> ".join(cycle)}') + + _HOST_TOKEN_PATTERN = re.compile(r'\{\{\s*host:[^}]+\}\}') + + def _validate_for_standalone(self, spec: workflow_module.WorkflowSpec): + """Raise ValueError if the spec uses features unsupported in standalone mode.""" + unsupported_features = [] + + if spec.timeout.exec_timeout is not None or spec.timeout.queue_timeout is not None: + unsupported_features.append( + 'WorkflowSpec.timeout is not supported in standalone mode; ' + 'use the service executor or remove the timeout') + + for group in self._groups(spec): + for task_spec in group.tasks: + for input_source in task_spec.inputs: + if isinstance(input_source, task_module.DatasetInputOutput): + unsupported_features.append( + f'Task "{task_spec.name}": dataset inputs require object 
storage') + elif isinstance(input_source, task_module.URLInputOutput): + unsupported_features.append( + f'Task "{task_spec.name}": URL inputs require network/storage access') + + for output in task_spec.outputs: + if isinstance(output, task_module.URLInputOutput): + unsupported_features.append( + f'Task "{task_spec.name}": URL outputs require object storage') + elif isinstance(output, task_module.DatasetInputOutput): + logger.info( + 'Task "%s": dataset output "%s" ignored in standalone mode ' + '— data is available in the work directory', + task_spec.name, output.dataset.name) + + for cred_name, cred_mount in task_spec.credentials.items(): + if isinstance(cred_mount, dict): + unsupported_features.append( + f'Task "{task_spec.name}": credential "{cred_name}" uses ' + f'dict-style mapping which the standalone executor does not ' + f'support; provide credentials as NAME=/path or flatten the ' + f'mapping') + elif cred_name not in self._credentials: + unsupported_features.append( + f'Task "{task_spec.name}": credential "{cred_name}" not provided. 
' + f'Use --credential {cred_name}=/path/to/dir') + + if task_spec.checkpoint: + unsupported_features.append( + f'Task "{task_spec.name}": checkpoints require object storage') + + if task_spec.volumeMounts: + unsupported_features.append( + f'Task "{task_spec.name}": volumeMounts require cluster-level host paths') + + if task_spec.privileged: + unsupported_features.append( + f'Task "{task_spec.name}": privileged containers are not supported in standalone mode') + + if task_spec.hostNetwork: + unsupported_features.append( + f'Task "{task_spec.name}": hostNetwork is not supported in standalone mode') + + if self._task_uses_host_tokens(task_spec): + unsupported_features.append( + f'Task "{task_spec.name}": {{{{host:taskname}}}} tokens require ' + f'parallel containers with shared networking') + + if unsupported_features: + raise ValueError( + 'The following features are not supported in standalone execution mode:\n - ' + + '\n - '.join(unsupported_features)) + + def _task_uses_host_tokens(self, task_spec: task_module.TaskSpec) -> bool: + """Return True if any text field in the task spec contains {{host:...}} tokens.""" + fields_to_check = list(task_spec.command) + list(task_spec.args) + fields_to_check += list(task_spec.environment.values()) + fields_to_check += [file_spec.contents for file_spec in task_spec.files] + return any(self._HOST_TOKEN_PATTERN.search(field) for field in fields_to_check) + + def _setup_directories(self): + """Create the work directory and per-task output directories on the host filesystem.""" + os.makedirs(self._work_dir, exist_ok=True) + for task_name in self._task_nodes: + os.makedirs(os.path.join(self._work_dir, task_name, 'output'), exist_ok=True) + + def _is_nonlead_failure_ignorable(self, task_name: str) -> bool: + """Return True if the task is a non-lead task in a group with ignoreNonleadStatus=true.""" + node = self._task_nodes[task_name] + group_spec = self._group_specs[node.group] + return group_spec.ignoreNonleadStatus and not 
def _is_task_satisfied(self, task_name: str) -> bool:
    """Return True if a completed task's result lets its dependents run.

    A task is satisfied when it exited with code 0, or when its failure is
    ignorable according to ``_is_nonlead_failure_ignorable``. The task must
    already have an entry in ``self._results``.
    """
    result = self._results[task_name]
    if result.exit_code == 0:
        return True
    return self._is_nonlead_failure_ignorable(task_name)

def _find_ready_tasks(self) -> List[str]:
    """Return the tasks that can run now, in ``self._task_nodes`` iteration order.

    A task is ready when it has no result yet, every upstream task has a
    result, and each of those results is satisfied.
    """
    completed = set(self._results.keys())
    ready = []
    for name, node in self._task_nodes.items():
        if name in completed:
            continue
        if node.upstream.issubset(completed):
            all_upstream_ok = all(self._is_task_satisfied(u) for u in node.upstream)
            if all_upstream_ok:
                ready.append(name)
    return ready

def _cancel_downstream(self, failed_task: str):
    """Record a cancelled result (exit_code -1, empty output_dir) for every
    transitive downstream task of *failed_task* that has no result yet.

    Tasks that already have a result are neither overwritten nor traversed
    further.
    """
    visited: Set[str] = set()
    queue: deque[str] = deque([failed_task])
    while queue:
        current = queue.popleft()
        for downstream in self._task_nodes[current].downstream:
            if downstream not in visited and downstream not in self._results:
                visited.add(downstream)
                self._results[downstream] = TaskResult(
                    name=downstream, exit_code=-1, output_dir='')
                queue.append(downstream)

def _task_gpu_count(self, task_spec: task_module.TaskSpec,
                    spec: workflow_module.WorkflowSpec) -> int:
    """Return the number of GPUs requested by a task, defaulting to 0.

    The task's inline ``resources.gpu`` wins when truthy; otherwise the named
    resource looked up in ``spec.resources`` is consulted.
    """
    if task_spec.resources.gpu:
        return task_spec.resources.gpu
    resource_spec = spec.resources.get(task_spec.resource)
    if resource_spec and resource_spec.gpu:
        return resource_spec.gpu
    return 0

def _run_task(self, node: TaskNode, spec: workflow_module.WorkflowSpec) -> TaskResult:
    """Execute a single task with ``<docker_cmd> run --rm`` and return its result.

    Steps, in order: write the task's inline files under
    ``<work_dir>/<task>/files``, resolve ``{{...}}`` tokens in command, args
    and environment, then assemble the argv (GPU / shm options, ``-e``
    variables, output / input / file / credential mounts, entrypoint, image,
    remaining arguments) and run it.

    Returns:
        A ``TaskResult`` carrying the container's exit code, or exit code 127
        when the docker executable cannot be found.

    Raises:
        ValueError: If an inline file path resolves outside the task's files
            directory, if unresolved tokens remain, or if a credential uses a
            dict-style mapping.
    """
    task_spec = node.spec
    task_dir = os.path.join(self._work_dir, node.name)
    output_dir = os.path.join(task_dir, 'output')
    files_dir = os.path.join(task_dir, 'files')
    os.makedirs(files_dir, exist_ok=True)

    token_map = self._build_token_map(node)

    # Materialise inline files on the host; they are bind-mounted further down.
    for file_spec in task_spec.files:
        resolved_contents = self._substitute_tokens(file_spec.contents, token_map)
        host_path = os.path.realpath(os.path.join(files_dir, file_spec.path.lstrip('/')))
        # realpath collapses ".." and symlinks, so this rejects paths that
        # would land outside files_dir.
        if not host_path.startswith(os.path.realpath(files_dir) + os.sep):
            raise ValueError(
                f'Task "{node.name}": file path "{file_spec.path}" escapes the task directory')
        os.makedirs(os.path.dirname(host_path), exist_ok=True)
        if file_spec.base64:
            with open(host_path, 'wb') as binary_file:
                binary_file.write(base64_module.b64decode(resolved_contents))
        else:
            with open(host_path, 'w', encoding='utf-8') as text_file:
                text_file.write(resolved_contents)

    resolved_command = [self._substitute_tokens(c, token_map) for c in task_spec.command]
    resolved_args = [self._substitute_tokens(a, token_map) for a in task_spec.args]
    resolved_env_values = [self._substitute_tokens(v, token_map) for v in task_spec.environment.values()]

    # Fail before starting the container if any placeholder is still present.
    all_resolved = resolved_command + resolved_args + resolved_env_values
    all_resolved += [self._substitute_tokens(f.contents, token_map) for f in task_spec.files]
    self._check_unresolved_tokens(node.name, all_resolved)

    docker_args = [self._docker_cmd, 'run', '--rm']

    gpu_count = self._task_gpu_count(task_spec, spec)
    if gpu_count > 0:
        available = self._detect_available_gpus()
        if available == 0:
            logger.warning(
                'Task "%s" requests %d GPU(s) but no GPUs are available — running without GPU support',
                node.name, gpu_count)
        elif gpu_count >= available:
            if gpu_count > available:
                logger.warning(
                    'Task "%s" requests %d GPU(s) but only %d available — running with %d GPU(s)',
                    node.name, gpu_count, available, available)
            docker_args += ['--gpus', 'all']
        else:
            # Selects device indices 0..gpu_count-1. The double quotes are part
            # of the argument value itself.
            # NOTE(review): presumably mirrors Docker's quoted device-list form — confirm.
            docker_args += ['--gpus', f'"device={",".join(str(i) for i in range(gpu_count))}"']
        logger.info('Task "%s" requesting %d GPU(s), using %d', node.name, gpu_count, min(gpu_count, available))

        # Tasks that request GPUs always get a shm size (explicit or default).
        docker_args += ['--shm-size', self._shm_size or self.DEFAULT_SHM_SIZE]
    elif self._shm_size:
        docker_args += ['--shm-size', self._shm_size]

    for env_key, resolved_value in zip(task_spec.environment.keys(), resolved_env_values, strict=True):
        docker_args += ['-e', f'{env_key}={resolved_value}']

    docker_args += ['-v', f'{output_dir}:{CONTAINER_DATA_PATH}/output']

    # Each task-type input is mounted read-only at input/<position in inputs>.
    for index, input_source in enumerate(task_spec.inputs):
        if isinstance(input_source, task_module.TaskInputOutput):
            upstream_result = self._results[input_source.task]
            docker_args += ['-v', f'{upstream_result.output_dir}:{CONTAINER_DATA_PATH}/input/{index}:ro']

    for file_spec in task_spec.files:
        host_path = os.path.realpath(os.path.join(files_dir, file_spec.path.lstrip('/')))
        docker_args += ['-v', f'{host_path}:{file_spec.path}:ro']

    for cred_name, cred_mount in task_spec.credentials.items():
        if isinstance(cred_mount, dict):
            raise ValueError(
                f'Task "{node.name}": credential "{cred_name}" uses dict-style '
                f'mapping which the standalone executor does not support; '
                f'provide credentials as NAME=/path or flatten the mapping')
        if isinstance(cred_mount, str) and cred_name in self._credentials:
            local_dir = os.path.abspath(self._credentials[cred_name])
            docker_args += ['-v', f'{local_dir}:{cred_mount}:ro']

    # The first command element becomes the entrypoint; the rest follow the image.
    if resolved_command:
        docker_args += ['--entrypoint', resolved_command[0]]
    docker_args.append(task_spec.image)
    docker_args += resolved_command[1:] + resolved_args

    if logger.isEnabledFor(logging.DEBUG):
        # Log the command with the value after every "-e" flag replaced, so
        # environment values do not end up in debug logs.
        redacted_args = []
        skip_next = False
        for arg in docker_args:
            if skip_next:
                redacted_args.append(arg.split('=', 1)[0] + '=REDACTED')
                skip_next = False
            elif arg == '-e':
                redacted_args.append(arg)
                skip_next = True
            else:
                redacted_args.append(arg)
        logger.debug('Docker command: %s', ' '.join(redacted_args))

    try:
        # capture_output=False: container output is not captured by this call.
        process = subprocess.run(docker_args, capture_output=False)
        return TaskResult(name=node.name, exit_code=process.returncode, output_dir=output_dir)
    except FileNotFoundError:
        logger.error('Docker not found. Is Docker installed and in your PATH?')
        return TaskResult(name=node.name, exit_code=127, output_dir=output_dir)
capture_output=False) + return TaskResult(name=node.name, exit_code=process.returncode, output_dir=output_dir) + except FileNotFoundError: + logger.error('Docker not found. Is Docker installed and in your PATH?') + return TaskResult(name=node.name, exit_code=127, output_dir=output_dir) + + def _build_token_map(self, node: TaskNode) -> Dict[str, str]: + """Build a mapping of {{token}} keys to container-side paths matching on-cluster layout.""" + tokens: Dict[str, str] = { + 'output': f'{CONTAINER_DATA_PATH}/output', + } + for index, input_source in enumerate(node.spec.inputs): + if isinstance(input_source, task_module.TaskInputOutput): + container_input_path = f'{CONTAINER_DATA_PATH}/input/{index}' + tokens[f'input:{input_source.task}'] = container_input_path + tokens[f'input:{index}'] = container_input_path + return tokens + + _UNRESOLVED_TOKEN_PATTERN = re.compile(r'\{\{[^}]+\}\}') + + def _substitute_tokens(self, text: str, tokens: Dict[str, str]) -> str: + """Replace all {{key}} placeholders in text with their corresponding token values.""" + for key, value in tokens.items(): + text = re.sub(r'\{\{\s*' + re.escape(key) + r'\s*\}\}', value, text) + return text + + def _check_unresolved_tokens(self, task_name: str, resolved_fields: List[str]): + """Raise ValueError if any resolved field still contains {{ }} placeholders.""" + unresolved: List[str] = [] + for field in resolved_fields: + for match in self._UNRESOLVED_TOKEN_PATTERN.finditer(field): + token = match.group(0) + if token not in unresolved: + unresolved.append(token) + if unresolved: + raise ValueError( + f'Task "{task_name}" has unresolved token(s): {", ".join(unresolved)}. 
# OSMO runtime tokens that must pass through Jinja expansion untouched.
_OSMO_TOKEN_PATTERN = re.compile(r'\{\{\s*(uuid|workflow_id|output|input:[^}]+|host:[^}]+)\s*\}\}')


def _expand_jinja_locally(spec_text: str,
                          set_variables: List[str] | None = None,
                          set_string_variables: List[str] | None = None) -> str:
    """Render the Jinja template in *spec_text* without contacting a server.

    Template variables come from the spec's default values, then
    *set_variables* (``key=value``; the value is stored as ``int`` or
    ``float`` when it parses as one, else as a string), then
    *set_string_variables* (``key=value``; always stored as a string). Later
    sources override earlier ones.

    OSMO tokens matched by ``_OSMO_TOKEN_PATTERN`` are shielded: each is
    swapped for a generated variable whose value is the original token text,
    so rendering reproduces the token verbatim.

    Raises:
        ValueError: If an override entry contains no ``=``.
    """
    file_text, default_values = workflow_utils.parse_workflow_spec(spec_text)
    template_data: Dict[str, Any] = default_values if default_values else {}

    for entry in (set_variables or []):
        key, separator, raw_value = entry.partition('=')
        if not separator:
            raise ValueError(f'--set value "{entry}" is incorrectly formatted (expected key=value)')
        # Prefer int, then float; keep the raw text if neither parses.
        for convert in (int, float):
            try:
                template_data[key] = convert(raw_value)
                break
            except ValueError:
                continue
        else:
            template_data[key] = raw_value

    for entry in (set_string_variables or []):
        key, separator, raw_value = entry.partition('=')
        if not separator:
            raise ValueError(
                f'--set-string value "{entry}" is incorrectly formatted (expected key=value)')
        template_data[key] = raw_value

    # Map each literal token text to the generated variable standing in for it.
    placeholder_map: Dict[str, str] = {}
    for token_match in _OSMO_TOKEN_PATTERN.finditer(file_text):
        token_text = token_match.group(0)
        digest = hashlib.sha256(token_match.group(1).strip().encode('utf-8')).hexdigest()
        variable_name = 'hash' + str(int(digest, 16))
        template_data[variable_name] = token_text
        placeholder_map[token_text] = variable_name

    protected_text = file_text
    for token_text, variable_name in placeholder_map.items():
        protected_text = protected_text.replace(token_text, '{{' + variable_name + '}}')

    environment = jinja2.sandbox.SandboxedEnvironment(undefined=jinja2.StrictUndefined)
    return environment.from_string(protected_text).render(template_data)
+ '}}') + + jinja_env = jinja2.sandbox.SandboxedEnvironment(undefined=jinja2.StrictUndefined) + template = jinja_env.from_string(protected_text) + return template.render(template_data) + + +def _spec_has_templates(spec_text: str) -> bool: + """Return True if the spec contains Jinja template markers that need expansion.""" + return any(marker in spec_text for marker in ('{{', '{%', '{#', 'default-values')) + + +def run_workflow_standalone(spec_path: str, work_dir: str | None = None, + keep_work_dir: bool = False, + resume: bool = False, + from_step: str | None = None, + docker_cmd: str = 'docker', + shm_size: str | None = None, + set_variables: List[str] | None = None, + set_string_variables: List[str] | None = None, + credentials: Dict[str, str] | None = None) -> bool: + """Load a workflow spec from disk and execute it in standalone mode via Docker, managing the work directory lifecycle.""" + if (resume or from_step) and work_dir is None: + raise ValueError( + '--resume and --from-step require --work-dir pointing to a previous run directory.') + + with open(spec_path, encoding='utf-8') as f: + spec_text = f.read() + + if _spec_has_templates(spec_text): + logger.info('Spec contains Jinja templates — expanding locally') + spec_text = _expand_jinja_locally(spec_text, set_variables, set_string_variables) + + created_work_dir = work_dir is None + if work_dir is None: + work_dir = tempfile.mkdtemp(prefix='osmo-standalone-') + logger.info('Using temporary work directory: %s', work_dir) + + success = False + try: + executor = StandaloneExecutor(work_dir=work_dir, keep_work_dir=keep_work_dir, + docker_cmd=docker_cmd, shm_size=shm_size, + credentials=credentials) + spec = executor.load_spec(spec_text) + success = executor.execute(spec, resume=resume or from_step is not None, + from_step=from_step) + finally: + if created_work_dir and not keep_work_dir and success: + shutil.rmtree(work_dir, ignore_errors=True) + elif not success: + logger.info('Work directory preserved for 
debugging: %s', work_dir) + + return success diff --git a/src/utils/tests/BUILD b/src/utils/tests/BUILD index 78372b738..f6b8d375e 100644 --- a/src/utils/tests/BUILD +++ b/src/utils/tests/BUILD @@ -54,3 +54,25 @@ osmo_py_test( requirement("truststore"), ] ) + +py_test( + name = "test_standalone_executor", + srcs = ["test_standalone_executor.py"], + deps = [ + "//src/utils:standalone_executor", + ], + data = [ + "//cookbook/tutorials:tutorial_specs", + ], + local = True, +) + +py_test( + name = "test_compose_executor", + srcs = ["test_compose_executor.py"], + deps = [ + "//src/utils:compose_executor", + "//src/utils:standalone_executor", + ], + local = True, +) diff --git a/src/utils/tests/test_compose_executor.py b/src/utils/tests/test_compose_executor.py new file mode 100644 index 000000000..f70ea8129 --- /dev/null +++ b/src/utils/tests/test_compose_executor.py @@ -0,0 +1,1090 @@ +""" +SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # pylint: disable=line-too-long + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
def _docker_compose_available() -> bool:
    """Return True if Docker Compose V2 is available.

    Runs ``docker compose version`` and reports whether it exited with
    status 0. Any failure to launch the command counts as "not available".
    """
    try:
        result = subprocess.run(
            ['docker', 'compose', 'version'],
            capture_output=True,
            timeout=10,
            check=False,
        )
    except (OSError, subprocess.TimeoutExpired):
        # This probe runs at import time, so it must never raise. OSError
        # covers a missing binary (FileNotFoundError) as well as one that
        # exists but cannot be executed (PermissionError).
        return False
    return result.returncode == 0
return yaml.safe_load(f) + + def test_single_task_generates_one_service(self): + """A single-task workflow produces a compose file with one service.""" + spec_text = textwrap.dedent('''\ + workflow: + name: hello + tasks: + - name: greet + image: alpine:3.18 + command: ["echo", "hello"] + ''') + compose = self._generate_and_load(spec_text) + + self.assertIn('greet', compose['services']) + self.assertEqual(len(compose['services']), 1) + svc = compose['services']['greet'] + self.assertEqual(svc['image'], 'alpine:3.18') + self.assertEqual(svc['entrypoint'], ['echo']) + self.assertEqual(svc['command'], ['hello']) + + def test_parallel_tasks_generate_separate_services(self): + """Independent tasks produce separate services with no depends_on.""" + spec_text = textwrap.dedent('''\ + workflow: + name: parallel + tasks: + - name: task-a + image: alpine:3.18 + command: ["echo", "a"] + - name: task-b + image: alpine:3.18 + command: ["echo", "b"] + ''') + compose = self._generate_and_load(spec_text) + + self.assertEqual(len(compose['services']), 2) + self.assertIn('task-a', compose['services']) + self.assertIn('task-b', compose['services']) + for svc in compose['services'].values(): + self.assertNotIn('depends_on', svc) + + def test_volumes_for_output(self): + """Each service has an output volume mapping to the host work directory.""" + spec_text = textwrap.dedent('''\ + workflow: + name: vol-test + tasks: + - name: task + image: alpine:3.18 + command: ["echo"] + ''') + compose = self._generate_and_load(spec_text) + + svc = compose['services']['task'] + output_volume = f'{os.path.abspath(os.path.join(self.work_dir, "task", "output"))}:{CONTAINER_DATA_PATH}/output' + self.assertIn(output_volume, svc['volumes']) + + def test_upstream_input_volumes(self): + """A consumer task mounts its upstream task's output as a read-only input.""" + spec_text = textwrap.dedent('''\ + workflow: + name: serial + tasks: + - name: producer + image: alpine:3.18 + command: ["echo"] + - name: 
consumer + image: alpine:3.18 + command: ["echo"] + inputs: + - task: producer + ''') + compose = self._generate_and_load(spec_text) + + consumer = compose['services']['consumer'] + upstream_output = os.path.abspath( + os.path.join(self.work_dir, 'producer', 'output')) + expected_volume = f'{upstream_output}:{CONTAINER_DATA_PATH}/input/0:ro' + self.assertIn(expected_volume, consumer['volumes']) + + def test_environment_variables_included(self): + """Environment variables from the spec appear in the compose service.""" + spec_text = textwrap.dedent('''\ + workflow: + name: env-test + tasks: + - name: task + image: alpine:3.18 + command: ["printenv"] + environment: + FOO: bar + BAZ: "42" + ''') + compose = self._generate_and_load(spec_text) + + svc = compose['services']['task'] + self.assertEqual(svc['environment']['FOO'], 'bar') + self.assertEqual(svc['environment']['BAZ'], '42') + + def test_inline_files_mounted(self): + """Inline files are written to disk and bind-mounted into the service.""" + spec_text = textwrap.dedent('''\ + workflow: + name: files-test + tasks: + - name: task + image: alpine:3.18 + command: ["sh", "/tmp/run.sh"] + files: + - contents: echo hello + path: /tmp/run.sh + ''') + compose = self._generate_and_load(spec_text) + + svc = compose['services']['task'] + file_volumes = [v for v in svc['volumes'] if '/tmp/run.sh:ro' in v] + self.assertEqual(len(file_volumes), 1) + + host_path = file_volumes[0].split(':')[0] + self.assertTrue(os.path.exists(host_path)) + with open(host_path, encoding='utf-8') as f: + self.assertEqual(f.read(), 'echo hello') + + def test_group_network_assigned(self): + """Tasks in a group share a compose network named after the group.""" + spec_text = textwrap.dedent('''\ + workflow: + name: grouped + groups: + - name: workers + tasks: + - name: leader + lead: true + image: alpine:3.18 + command: ["echo"] + - name: follower + image: alpine:3.18 + command: ["echo"] + ''') + compose = self._generate_and_load(spec_text) + + 
self.assertIn('workers', compose.get('networks', {})) + self.assertEqual(compose['services']['leader']['networks'], ['workers']) + self.assertEqual(compose['services']['follower']['networks'], ['workers']) + + def test_gpu_resources_in_compose(self): + """GPU tasks get deploy.resources.reservations.devices and shm_size.""" + spec_text = textwrap.dedent('''\ + workflow: + name: gpu-test + resources: + gpu-res: + gpu: 2 + tasks: + - name: train + image: pytorch:latest + resource: gpu-res + command: ["python", "train.py"] + ''') + compose = self._generate_and_load(spec_text) + + svc = compose['services']['train'] + devices = svc['deploy']['resources']['reservations']['devices'] + self.assertEqual(len(devices), 1) + self.assertEqual(devices[0]['driver'], 'nvidia') + self.assertEqual(devices[0]['count'], 2) + self.assertIn('gpu', devices[0]['capabilities']) + self.assertEqual(svc['shm_size'], '16g') + + def test_custom_shm_size(self): + """A user-specified shm_size overrides the default for GPU tasks.""" + spec_text = textwrap.dedent('''\ + workflow: + name: shm-test + resources: + gpu-res: + gpu: 1 + tasks: + - name: train + image: pytorch:latest + resource: gpu-res + command: ["python"] + ''') + executor = ComposeExecutor( + work_dir=self.work_dir, keep_work_dir=True, shm_size='32g') + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + executor._validate_for_compose(spec) + executor._setup_directories() + executor._generate_compose_file(spec) + + compose_path = os.path.join(self.work_dir, COMPOSE_FILE_NAME) + with open(compose_path, encoding='utf-8') as f: + compose = yaml.safe_load(f) + self.assertEqual(compose['services']['train']['shm_size'], '32g') + + def test_non_gpu_task_no_deploy_section(self): + """A CPU-only task has no deploy section in the compose service.""" + spec_text = textwrap.dedent('''\ + workflow: + name: cpu-test + tasks: + - name: preprocess + image: alpine:3.18 + command: ["echo"] + ''') + compose = 
class TestComposeTokenMap(unittest.TestCase):
    """Check the ``host:<task>`` entries produced by ``_build_token_map``."""

    @staticmethod
    def _token_map_for(spec_text: str, task_name: str) -> dict:
        """Load *spec_text*, build the DAG and return the token map of *task_name*."""
        executor = ComposeExecutor(work_dir='/tmp/unused')
        spec = executor.load_spec(spec_text)
        executor._build_dag(spec)
        return executor._build_token_map(executor._task_nodes[task_name])

    def test_host_tokens_for_group_members(self):
        """Tasks in the same group get host tokens for all group members."""
        token_map = self._token_map_for(textwrap.dedent('''\
            workflow:
              name: host-tokens
              groups:
              - name: workers
                tasks:
                - name: leader
                  lead: true
                  image: alpine:3.18
                  command: ["echo"]
                - name: worker-a
                  image: alpine:3.18
                  command: ["echo"]
                - name: worker-b
                  image: alpine:3.18
                  command: ["echo"]
        '''), 'leader')

        # Each member is expected to resolve to its own task name.
        self.assertEqual(token_map['host:leader'], 'leader')
        self.assertEqual(token_map['host:worker-a'], 'worker-a')
        self.assertEqual(token_map['host:worker-b'], 'worker-b')
        self.assertEqual(token_map['output'], f'{CONTAINER_DATA_PATH}/output')

    # NOTE(review): the method name says "no host tokens", but the assertion
    # below checks that a host token IS present. Consider renaming.
    def test_no_host_tokens_for_single_task_group(self):
        """A single-task group still gets a host token for itself."""
        token_map = self._token_map_for(textwrap.dedent('''\
            workflow:
              name: single
              tasks:
              - name: solo
                image: alpine:3.18
                command: ["echo"]
        '''), 'solo')
        self.assertIn('host:solo', token_map)
executor._build_dag(spec) + with self.assertRaises(ValueError) as context: + executor._validate_for_compose(spec) + self.assertIn('dataset', str(context.exception)) + + def test_credential_not_provided_rejected(self): + """A credential required by a task but not supplied via --credential is rejected.""" + spec_text = textwrap.dedent('''\ + workflow: + name: bad + tasks: + - name: task + image: ubuntu:24.04 + command: ["echo"] + credentials: + my-secret: NGC_API_KEY + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + with self.assertRaises(ValueError) as context: + executor._validate_for_compose(spec) + self.assertIn('credential', str(context.exception)) + + def test_provided_credential_passes(self): + """A credential supplied via --credential is accepted in compose mode.""" + spec_text = textwrap.dedent('''\ + workflow: + name: ok + tasks: + - name: task + image: ubuntu:24.04 + command: ["echo"] + credentials: + my-secret: NGC_API_KEY + ''') + with tempfile.TemporaryDirectory() as tmp_dir: + secret_dir = os.path.join(tmp_dir, 'secret-dir') + os.makedirs(secret_dir) + executor = ComposeExecutor( + work_dir=tmp_dir, credentials={'my-secret': secret_dir}) + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + executor._validate_for_compose(spec) + + def test_simple_spec_passes(self): + """A simple spec with only task-to-task inputs passes compose validation.""" + spec_text = textwrap.dedent('''\ + workflow: + name: ok + tasks: + - name: producer + image: alpine:3.18 + command: ["echo"] + - name: consumer + image: alpine:3.18 + command: ["echo"] + inputs: + - task: producer + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + executor._validate_for_compose(spec) + + +class TestFindReadyWave(unittest.TestCase): + """Verify the group-aware wave scheduling logic.""" + + def _make_executor(self, spec_text: str) -> ComposeExecutor: + executor = 
ComposeExecutor(work_dir='/tmp/unused') + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + return executor + + def test_all_independent_tasks_in_one_wave(self): + """All independent tasks appear in the first wave.""" + executor = self._make_executor(textwrap.dedent('''\ + workflow: + name: parallel + tasks: + - name: a + image: alpine:3.18 + command: ["echo"] + - name: b + image: alpine:3.18 + command: ["echo"] + - name: c + image: alpine:3.18 + command: ["echo"] + ''')) + wave = executor._find_ready_wave() + self.assertEqual(set(wave), {'a', 'b', 'c'}) + + def test_serial_chain_one_per_wave(self): + """A serial chain yields one task per wave.""" + executor = self._make_executor(textwrap.dedent('''\ + workflow: + name: serial + tasks: + - name: first + image: alpine:3.18 + command: ["echo"] + - name: second + image: alpine:3.18 + command: ["echo"] + inputs: + - task: first + ''')) + + wave1 = executor._find_ready_wave() + self.assertEqual(wave1, ['first']) + + executor._results['first'] = TaskResult( + name='first', exit_code=0, output_dir='/tmp/out') + wave2 = executor._find_ready_wave() + self.assertEqual(wave2, ['second']) + + def test_multi_task_group_co_scheduled(self): + """All tasks in a multi-task group appear in the same wave.""" + executor = self._make_executor(textwrap.dedent('''\ + workflow: + name: grouped + groups: + - name: workers + tasks: + - name: leader + lead: true + image: alpine:3.18 + command: ["echo"] + - name: follower + image: alpine:3.18 + command: ["echo"] + ''')) + wave = executor._find_ready_wave() + self.assertEqual(set(wave), {'leader', 'follower'}) + + def test_diamond_dag_waves(self): + """A diamond DAG produces three waves: root, fan-out, fan-in.""" + executor = self._make_executor(textwrap.dedent('''\ + workflow: + name: diamond + tasks: + - name: root + image: alpine:3.18 + command: ["echo"] + - name: left + image: alpine:3.18 + command: ["echo"] + inputs: + - task: root + - name: right + image: alpine:3.18 + 
class TestComposeProjectName(unittest.TestCase):
    """Check the value returned by ``_compose_project_name``."""

    @staticmethod
    def _project_name() -> str:
        """Load the fixture spec named ``my-workflow`` and return its project name."""
        executor = ComposeExecutor(work_dir='/tmp/unused')
        spec = executor.load_spec(textwrap.dedent('''\
            workflow:
              name: my-workflow
              tasks:
              - name: t
                image: alpine:3.18
                command: ["echo"]
        '''))
        return executor._compose_project_name(spec)

    def test_simple_name(self):
        """The project name is the workflow name prefixed with ``osmo-``."""
        self.assertEqual(self._project_name(), 'osmo-my-workflow')

    # NOTE(review): this test loads the same plain 'my-workflow' name as
    # test_simple_name, so no special characters are exercised. Confirm which
    # names the spec accepts and extend the fixture accordingly.
    def test_name_with_special_chars(self):
        """The project name starts with ``osmo-`` and has only ``[a-z0-9-]``."""
        project_name = self._project_name()
        self.assertTrue(project_name.startswith('osmo-'))
        self.assertRegex(project_name, r'^[a-z0-9-]+$')
class TestUnresolvedTokenDetection(unittest.TestCase):
    """Unresolved ``{{...}}`` tokens must make ``execute`` fail with ValueError."""

    def setUp(self):
        # Fresh scratch directory per test; removed again in tearDown.
        self.work_dir = tempfile.mkdtemp(prefix='osmo-compose-tokens-')

    def tearDown(self):
        shutil.rmtree(self.work_dir, ignore_errors=True)

    def test_unresolved_jinja_variable_caught(self):
        """An unknown ``{{missing_var}}`` is reported by name in the error."""
        bad_spec_text = textwrap.dedent('''\
            workflow:
              name: bad
              tasks:
              - name: task
                image: alpine:3.18
                command: ["echo", "{{missing_var}}"]
        ''')
        executor = ComposeExecutor(work_dir=self.work_dir, keep_work_dir=True)
        parsed_spec = executor.load_spec(bad_spec_text)

        with self.assertRaises(ValueError) as raised:
            executor.execute(parsed_spec)

        error_text = str(raised.exception)
        self.assertIn('missing_var', error_text)
class TestRunWorkflowComposeErrors(unittest.TestCase):
    """Error-path checks for ``run_workflow_compose()``."""

    def test_nonexistent_file_raises(self):
        """A spec path that does not exist surfaces as FileNotFoundError."""
        missing_spec = '/nonexistent/path/spec.yaml'
        self.assertRaises(
            FileNotFoundError, run_workflow_compose, spec_path=missing_spec)
+ - name: task-a + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo 'a' > {{output}}/marker.txt"] + - name: task-b + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo 'b' > {{output}}/marker.txt"] + - name: task-c + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo 'c' > {{output}}/marker.txt"] + ''') + self.assertTrue(self._execute_spec(spec_text)) + for task_name, expected in [('task-a', 'a'), ('task-b', 'b'), ('task-c', 'c')]: + marker = os.path.join(self.work_dir, task_name, 'output', 'marker.txt') + with open(marker) as f: + self.assertEqual(f.read().strip(), expected) + + def test_serial_data_flow(self): + """Data written by a producer is readable by a consumer.""" + spec_text = textwrap.dedent('''\ + workflow: + name: serial-compose + tasks: + - name: producer + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo 'from_producer' > {{output}}/data.txt"] + - name: consumer + image: alpine:3.18 + command: ["sh", "-c"] + args: ["cat {{input:0}}/data.txt > {{output}}/received.txt"] + inputs: + - task: producer + ''') + self.assertTrue(self._execute_spec(spec_text)) + received = os.path.join(self.work_dir, 'consumer', 'output', 'received.txt') + with open(received) as f: + self.assertEqual(f.read().strip(), 'from_producer') + + def test_diamond_dag(self): + """A diamond DAG executes with correct data flow.""" + spec_text = textwrap.dedent('''\ + workflow: + name: diamond-compose + tasks: + - name: root + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo 'root_data' > {{output}}/base.txt"] + - name: left + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo 'left:' > {{output}}/result.txt && cat {{input:0}}/base.txt >> {{output}}/result.txt"] + inputs: + - task: root + - name: right + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo 'right:' > {{output}}/result.txt && cat {{input:0}}/base.txt >> {{output}}/result.txt"] + inputs: + - task: root + - name: join + image: alpine:3.18 + command: ["sh", "-c"] + args: 
["cat {{input:0}}/result.txt > {{output}}/final.txt && cat {{input:1}}/result.txt >> {{output}}/final.txt"] + inputs: + - task: left + - task: right + ''') + self.assertTrue(self._execute_spec(spec_text)) + final = os.path.join(self.work_dir, 'join', 'output', 'final.txt') + with open(final) as f: + content = f.read() + self.assertIn('left:', content) + self.assertIn('right:', content) + self.assertIn('root_data', content) + + def test_failure_cancels_downstream(self): + """A failed task prevents downstream dependents from running.""" + spec_text = textwrap.dedent('''\ + workflow: + name: fail-compose + tasks: + - name: failing + image: alpine:3.18 + command: ["sh", "-c", "exit 1"] + - name: should-not-run + image: alpine:3.18 + command: ["sh", "-c", "echo oops > {{output}}/bad.txt"] + inputs: + - task: failing + ''') + self.assertFalse(self._execute_spec(spec_text)) + output_file = os.path.join( + self.work_dir, 'should-not-run', 'output', 'bad.txt') + self.assertFalse(os.path.exists(output_file)) + + def test_environment_variables(self): + """Environment variables are passed to compose containers.""" + spec_text = textwrap.dedent('''\ + workflow: + name: env-compose + tasks: + - name: check-env + image: alpine:3.18 + command: ["sh", "-c"] + args: ["test \\"$MY_VAR\\" = \\"hello\\" && echo ok > {{output}}/result.txt"] + environment: + MY_VAR: hello + ''') + self.assertTrue(self._execute_spec(spec_text)) + + def test_inline_file_mounted(self): + """An inline file is written and mounted into the container.""" + spec_text = textwrap.dedent('''\ + workflow: + name: files-compose + tasks: + - name: check-file + image: alpine:3.18 + command: ["sh", "/tmp/run.sh"] + files: + - contents: | + echo "script ran" > {{output}}/result.txt + path: /tmp/run.sh + ''') + self.assertTrue(self._execute_spec(spec_text)) + result = os.path.join(self.work_dir, 'check-file', 'output', 'result.txt') + with open(result) as f: + self.assertIn('script ran', f.read()) + + def 
test_compose_file_preserved(self): + """The generated docker-compose.yml is kept in the work directory.""" + spec_text = textwrap.dedent('''\ + workflow: + name: preserve-compose + tasks: + - name: task + image: alpine:3.18 + command: ["echo", "ok"] + ''') + self._execute_spec(spec_text) + compose_path = os.path.join(self.work_dir, COMPOSE_FILE_NAME) + self.assertTrue(os.path.exists(compose_path)) + + def test_groups_with_data_flow(self): + """Groups with inter-group data dependencies execute correctly.""" + spec_text = textwrap.dedent('''\ + workflow: + name: group-flow-compose + groups: + - name: prepare + tasks: + - name: generate + lead: true + image: alpine:3.18 + command: ["sh", "-c"] + args: + - | + mkdir -p {{output}}/data + for i in 1 2 3; do echo "sample_$i" >> {{output}}/data/dataset.csv; done + - name: train + tasks: + - name: trainer + lead: true + image: alpine:3.18 + command: ["sh", "-c"] + args: + - | + wc -l {{input:0}}/data/dataset.csv > {{output}}/count.txt + inputs: + - task: generate + ''') + self.assertTrue(self._execute_spec(spec_text)) + count_file = os.path.join(self.work_dir, 'trainer', 'output', 'count.txt') + with open(count_file) as f: + self.assertIn('3', f.read()) + + +@unittest.skipUnless(DOCKER_COMPOSE_AVAILABLE, SKIP_COMPOSE_MSG) +class TestComposeLeadTaskPolicy(unittest.TestCase): + """Verify ignoreNonleadStatus behavior in compose mode.""" + + def setUp(self): + self.work_dir = tempfile.mkdtemp(prefix='osmo-compose-lead-') + + def tearDown(self): + shutil.rmtree(self.work_dir, ignore_errors=True) + + def test_nonlead_failure_ignored_when_flag_true(self): + """With ignoreNonleadStatus=true, a non-lead failure does not abort the workflow.""" + spec_text = textwrap.dedent('''\ + workflow: + name: lead-policy-compose + groups: + - name: workers + tasks: + - name: leader + lead: true + image: alpine:3.18 + command: ["echo", "ok"] + - name: follower + image: alpine:3.18 + command: ["sh", "-c", "exit 1"] + ''') + executor = 
@unittest.skipUnless(DOCKER_COMPOSE_AVAILABLE, SKIP_COMPOSE_MSG)
class TestRunWorkflowCompose(unittest.TestCase):
    """Test the top-level run_workflow_compose() function."""

    def setUp(self):
        self.work_dir = tempfile.mkdtemp(prefix='osmo-compose-func-')

    def tearDown(self):
        shutil.rmtree(self.work_dir, ignore_errors=True)

    def _write_spec(self, spec_text: str) -> str:
        """Write *spec_text* to a temporary ``.yaml`` file and return its path.

        The file is written as UTF-8 so that it does not depend on the host
        locale, and it is removed again during test cleanup.
        """
        with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False,
                                         encoding='utf-8') as spec_file:
            spec_file.write(spec_text)
        self.addCleanup(os.unlink, spec_file.name)
        return spec_file.name

    def test_success_with_work_dir(self):
        """A successful run preserves the caller-supplied work directory."""
        spec_path = self._write_spec(textwrap.dedent('''\
            workflow:
              name: func-test
              tasks:
              - name: task
                image: alpine:3.18
                command: ["echo", "ok"]
        '''))
        result = run_workflow_compose(
            spec_path=spec_path,
            work_dir=self.work_dir,
            keep_work_dir=True,
        )
        self.assertTrue(result)
        self.assertTrue(os.path.exists(self.work_dir))

    def test_failure_preserves_work_dir(self):
        """On failure, the work directory is preserved."""
        spec_path = self._write_spec(textwrap.dedent('''\
            workflow:
              name: fail-func
              tasks:
              - name: task
                image: alpine:3.18
                command: ["sh", "-c", "exit 1"]
        '''))
        # keep_work_dir=False: the directory must survive because the run failed.
        result = run_workflow_compose(
            spec_path=spec_path,
            work_dir=self.work_dir,
            keep_work_dir=False,
        )
        self.assertFalse(result)
        self.assertTrue(os.path.exists(self.work_dir))
def _docker_available() -> bool:
    """Return True if the Docker daemon is reachable via 'docker info', False otherwise.

    Any failure to launch the command counts as "not available".
    """
    try:
        result = subprocess.run(
            ['docker', 'info'],
            capture_output=True,
            timeout=10,
            check=False,
        )
    except (OSError, subprocess.TimeoutExpired):
        # This probe runs at import time, so it must never raise. OSError
        # covers a missing binary (FileNotFoundError) as well as one that
        # exists but cannot be executed (PermissionError).
        return False
    return result.returncode == 0
self.assertEqual(spec.tasks[0].image, 'ubuntu:24.04') + + def test_serial_tasks_spec(self): + """Parse a two-task serial workflow and verify the task input dependency is resolved.""" + spec_text = textwrap.dedent('''\ + workflow: + name: serial-tasks + tasks: + - name: task1 + image: ubuntu:22.04 + command: [sh] + args: [/tmp/run.sh] + files: + - contents: | + echo "Hello from task1" + echo "data" > {{output}}/test.txt + path: /tmp/run.sh + - name: task2 + image: ubuntu:22.04 + command: [sh] + args: [/tmp/run.sh] + files: + - contents: | + cat {{input:0}}/test.txt + path: /tmp/run.sh + inputs: + - task: task1 + ''') + executor = StandaloneExecutor(work_dir='/tmp/unused') + spec = executor.load_spec(spec_text) + self.assertEqual(spec.name, 'serial-tasks') + self.assertEqual(len(spec.tasks), 2) + first_input = spec.tasks[1].inputs[0] + self.assertIsInstance(first_input, task_module.TaskInputOutput) + if isinstance(first_input, task_module.TaskInputOutput): + self.assertEqual(first_input.task, 'task1') + + def test_groups_spec(self): + """Parse a grouped workflow and verify group structure and the lead task flag.""" + spec_text = textwrap.dedent('''\ + workflow: + name: grouped + groups: + - name: first-group + tasks: + - name: leader + lead: true + image: ubuntu:24.04 + command: ["echo", "leader"] + - name: follower + image: ubuntu:24.04 + command: ["echo", "follower"] + ''') + executor = StandaloneExecutor(work_dir='/tmp/unused') + spec = executor.load_spec(spec_text) + self.assertEqual(len(spec.groups), 1) + self.assertEqual(len(spec.groups[0].tasks), 2) + self.assertTrue(spec.groups[0].tasks[0].lead) + + def test_versioned_spec(self): + """Parse a spec with an explicit version field and verify it loads correctly.""" + spec_text = textwrap.dedent('''\ + version: 2 + workflow: + name: versioned + tasks: + - name: task + image: alpine:3.18 + command: ["echo", "ok"] + ''') + executor = StandaloneExecutor(work_dir='/tmp/unused') + spec = executor.load_spec(spec_text) + 
self.assertEqual(spec.name, 'versioned') + + def test_invalid_version_rejected(self): + """Reject a spec with an unsupported version number.""" + spec_text = textwrap.dedent('''\ + version: 99 + workflow: + name: bad-version + tasks: + - name: task + image: alpine:3.18 + command: ["echo", "ok"] + ''') + executor = StandaloneExecutor(work_dir='/tmp/unused') + with self.assertRaises(ValueError): + executor.load_spec(spec_text) + + def test_both_tasks_and_groups_rejected(self): + """Reject a spec that defines both top-level tasks and groups simultaneously.""" + spec_text = textwrap.dedent('''\ + workflow: + name: invalid + tasks: + - name: t + image: alpine:3.18 + command: ["echo"] + groups: + - name: g + tasks: + - name: t2 + image: alpine:3.18 + command: ["echo"] + ''') + executor = StandaloneExecutor(work_dir='/tmp/unused') + with self.assertRaises(ValueError): + executor.load_spec(spec_text) + + def test_empty_workflow_rejected(self): + """Reject a spec with no tasks or groups defined.""" + spec_text = textwrap.dedent('''\ + workflow: + name: empty + ''') + executor = StandaloneExecutor(work_dir='/tmp/unused') + with self.assertRaises(ValueError): + executor.load_spec(spec_text) + + def test_resources_spec_parsed(self): + """Parse a spec with resource definitions and verify cpu/memory values.""" + spec_text = textwrap.dedent('''\ + workflow: + name: with-resources + resources: + default: + cpu: 2 + memory: 4Gi + storage: 10Gi + tasks: + - name: task + image: ubuntu:24.04 + command: ["echo", "ok"] + ''') + executor = StandaloneExecutor(work_dir='/tmp/unused') + spec = executor.load_spec(spec_text) + self.assertEqual(spec.resources['default'].cpu, 2) + self.assertEqual(spec.resources['default'].memory, '4Gi') + + def test_environment_parsed(self): + """Parse a spec with environment variables and verify key-value pairs are preserved.""" + spec_text = textwrap.dedent('''\ + workflow: + name: env-test + tasks: + - name: task + image: alpine:3.18 + command: ["printenv"] 
+ environment: + MY_VAR: hello + ANOTHER: world + ''') + executor = StandaloneExecutor(work_dir='/tmp/unused') + spec = executor.load_spec(spec_text) + self.assertEqual(spec.tasks[0].environment['MY_VAR'], 'hello') + self.assertEqual(spec.tasks[0].environment['ANOTHER'], 'world') + + +class TestBuildDag(unittest.TestCase): + """Verify DAG construction from task dependencies.""" + + def _make_executor(self) -> StandaloneExecutor: + """Create a StandaloneExecutor with a throwaway work directory for DAG-only tests.""" + return StandaloneExecutor(work_dir='/tmp/unused') + + def test_no_dependencies(self): + """All tasks with no input dependencies have empty upstream and downstream sets.""" + spec_text = textwrap.dedent('''\ + workflow: + name: parallel + tasks: + - name: a + image: alpine:3.18 + command: ["echo", "a"] + - name: b + image: alpine:3.18 + command: ["echo", "b"] + - name: c + image: alpine:3.18 + command: ["echo", "c"] + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + + self.assertEqual(len(executor._task_nodes), 3) + for node in executor._task_nodes.values(): + self.assertEqual(len(node.upstream), 0) + self.assertEqual(len(node.downstream), 0) + + def test_serial_chain(self): + """A three-task chain produces correct upstream/downstream links at each step.""" + spec_text = textwrap.dedent('''\ + workflow: + name: serial + tasks: + - name: first + image: alpine:3.18 + command: ["echo"] + - name: second + image: alpine:3.18 + command: ["echo"] + inputs: + - task: first + - name: third + image: alpine:3.18 + command: ["echo"] + inputs: + - task: second + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + + self.assertEqual(executor._task_nodes['first'].upstream, set()) + self.assertEqual(executor._task_nodes['first'].downstream, {'second'}) + self.assertEqual(executor._task_nodes['second'].upstream, {'first'}) + 
self.assertEqual(executor._task_nodes['second'].downstream, {'third'}) + self.assertEqual(executor._task_nodes['third'].upstream, {'second'}) + self.assertEqual(executor._task_nodes['third'].downstream, set()) + + def test_diamond_dependency(self): + """A diamond DAG (root -> left/right -> join) wires fan-out and fan-in edges correctly.""" + spec_text = textwrap.dedent('''\ + workflow: + name: diamond + tasks: + - name: root + image: alpine:3.18 + command: ["echo"] + - name: left + image: alpine:3.18 + command: ["echo"] + inputs: + - task: root + - name: right + image: alpine:3.18 + command: ["echo"] + inputs: + - task: root + - name: join + image: alpine:3.18 + command: ["echo"] + inputs: + - task: left + - task: right + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + + self.assertEqual(executor._task_nodes['root'].downstream, {'left', 'right'}) + self.assertEqual(executor._task_nodes['join'].upstream, {'left', 'right'}) + + def test_unknown_dependency_raises(self): + """Referencing a non-existent upstream task raises ValueError.""" + spec_text = textwrap.dedent('''\ + workflow: + name: broken + tasks: + - name: task1 + image: alpine:3.18 + command: ["echo"] + inputs: + - task: nonexistent + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + with self.assertRaises(ValueError) as context: + executor._build_dag(spec) + self.assertIn('nonexistent', str(context.exception)) + + def test_groups_with_cross_group_deps(self): + """Dependencies between tasks in different groups are wired correctly.""" + spec_text = textwrap.dedent('''\ + workflow: + name: cross-group + groups: + - name: fetch + tasks: + - name: download + lead: true + image: alpine:3.18 + command: ["echo"] + - name: process + tasks: + - name: transform + lead: true + image: alpine:3.18 + command: ["echo"] + inputs: + - task: download + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + 
executor._build_dag(spec) + + self.assertEqual(executor._task_nodes['download'].downstream, {'transform'}) + self.assertEqual(executor._task_nodes['transform'].upstream, {'download'}) + + +class TestCycleDetection(unittest.TestCase): + """Verify that circular dependencies are detected and reported during DAG construction.""" + + def _make_executor(self) -> StandaloneExecutor: + """Create a StandaloneExecutor with a throwaway work directory for cycle-detection tests.""" + return StandaloneExecutor(work_dir='/tmp/unused') + + def test_direct_cycle_two_tasks(self): + """Two tasks that depend on each other form a direct cycle and are rejected.""" + spec_text = textwrap.dedent('''\ + workflow: + name: cycle + tasks: + - name: a + image: alpine:3.18 + command: ["echo"] + inputs: + - task: b + - name: b + image: alpine:3.18 + command: ["echo"] + inputs: + - task: a + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + with self.assertRaises(ValueError) as context: + executor._build_dag(spec) + error_message = str(context.exception) + self.assertIn('Circular dependency', error_message) + self.assertIn('a', error_message) + self.assertIn('b', error_message) + + def test_indirect_cycle_three_tasks(self): + """Three tasks forming a cycle (a -> b -> c -> a) are rejected.""" + spec_text = textwrap.dedent('''\ + workflow: + name: cycle + tasks: + - name: a + image: alpine:3.18 + command: ["echo"] + inputs: + - task: c + - name: b + image: alpine:3.18 + command: ["echo"] + inputs: + - task: a + - name: c + image: alpine:3.18 + command: ["echo"] + inputs: + - task: b + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + with self.assertRaises(ValueError) as context: + executor._build_dag(spec) + self.assertIn('Circular dependency', str(context.exception)) + + def test_cycle_in_subgraph_with_valid_root(self): + """A cycle in a subgraph is detected even when other tasks have no cycle.""" + spec_text = textwrap.dedent('''\ + 
workflow: + name: partial-cycle + tasks: + - name: root + image: alpine:3.18 + command: ["echo"] + - name: a + image: alpine:3.18 + command: ["echo"] + inputs: + - task: root + - task: b + - name: b + image: alpine:3.18 + command: ["echo"] + inputs: + - task: a + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + with self.assertRaises(ValueError) as context: + executor._build_dag(spec) + error_message = str(context.exception) + self.assertIn('Circular dependency', error_message) + self.assertIn('a', error_message) + self.assertIn('b', error_message) + + def test_no_cycle_linear_chain(self): + """A linear chain (a -> b -> c) has no cycle and is accepted.""" + spec_text = textwrap.dedent('''\ + workflow: + name: linear + tasks: + - name: a + image: alpine:3.18 + command: ["echo"] + - name: b + image: alpine:3.18 + command: ["echo"] + inputs: + - task: a + - name: c + image: alpine:3.18 + command: ["echo"] + inputs: + - task: b + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + + def test_no_cycle_diamond(self): + """A diamond DAG (root -> left/right -> join) has no cycle and is accepted.""" + spec_text = textwrap.dedent('''\ + workflow: + name: diamond + tasks: + - name: root + image: alpine:3.18 + command: ["echo"] + - name: left + image: alpine:3.18 + command: ["echo"] + inputs: + - task: root + - name: right + image: alpine:3.18 + command: ["echo"] + inputs: + - task: root + - name: join + image: alpine:3.18 + command: ["echo"] + inputs: + - task: left + - task: right + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + + +class TestFindReadyTasks(unittest.TestCase): + """Verify correct identification of tasks ready to execute.""" + + def test_all_root_tasks_ready(self): + """Tasks with no upstream dependencies are immediately ready.""" + spec_text = textwrap.dedent('''\ + workflow: + name: parallel + tasks: + - name: 
a + image: alpine:3.18 + command: ["echo"] + - name: b + image: alpine:3.18 + command: ["echo"] + ''') + executor = StandaloneExecutor(work_dir='/tmp/unused') + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + + ready = executor._find_ready_tasks() + self.assertEqual(set(ready), {'a', 'b'}) + + def test_dependent_not_ready_until_upstream_completes(self): + """A downstream task only becomes ready after its upstream dependency completes.""" + spec_text = textwrap.dedent('''\ + workflow: + name: serial + tasks: + - name: first + image: alpine:3.18 + command: ["echo"] + - name: second + image: alpine:3.18 + command: ["echo"] + inputs: + - task: first + ''') + executor = StandaloneExecutor(work_dir='/tmp/unused') + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + + ready = executor._find_ready_tasks() + self.assertEqual(ready, ['first']) + + executor._results['first'] = TaskResult(name='first', exit_code=0, output_dir='/tmp/out') + ready = executor._find_ready_tasks() + self.assertEqual(ready, ['second']) + + def test_failed_upstream_blocks_downstream(self): + """A failed upstream task prevents its downstream dependents from becoming ready.""" + spec_text = textwrap.dedent('''\ + workflow: + name: serial + tasks: + - name: first + image: alpine:3.18 + command: ["echo"] + - name: second + image: alpine:3.18 + command: ["echo"] + inputs: + - task: first + ''') + executor = StandaloneExecutor(work_dir='/tmp/unused') + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + + executor._results['first'] = TaskResult(name='first', exit_code=1, output_dir='/tmp/out') + ready = executor._find_ready_tasks() + self.assertEqual(ready, []) + + +class TestCancelDownstream(unittest.TestCase): + """Verify that downstream tasks are cancelled when an upstream task fails.""" + + def test_cascading_cancel(self): + """Cancellation of a failed task propagates to all transitive downstream dependents.""" + spec_text = textwrap.dedent('''\ + 
workflow: + name: chain + tasks: + - name: a + image: alpine:3.18 + command: ["echo"] + - name: b + image: alpine:3.18 + command: ["echo"] + inputs: + - task: a + - name: c + image: alpine:3.18 + command: ["echo"] + inputs: + - task: b + ''') + executor = StandaloneExecutor(work_dir='/tmp/unused') + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + + executor._results['a'] = TaskResult(name='a', exit_code=1, output_dir='/tmp') + executor._cancel_downstream('a') + + self.assertIn('b', executor._results) + self.assertIn('c', executor._results) + self.assertEqual(executor._results['b'].exit_code, -1) + self.assertEqual(executor._results['c'].exit_code, -1) + + +class TestSubstituteTokens(unittest.TestCase): + """Verify {{token}} placeholder replacement in command strings and file contents.""" + + def test_output_token(self): + """The {{output}} token is replaced with the task output directory path.""" + executor = StandaloneExecutor(work_dir='/tmp/unused') + tokens = {'output': '/work/task1/output'} + result = executor._substitute_tokens('echo data > {{output}}/file.txt', tokens) + self.assertEqual(result, 'echo data > /work/task1/output/file.txt') + + def test_input_by_index(self): + """The {{input:N}} token is replaced with the Nth upstream output directory.""" + executor = StandaloneExecutor(work_dir='/tmp/unused') + tokens = {'input:0': '/work/upstream/output'} + result = executor._substitute_tokens('cat {{input:0}}/data.csv', tokens) + self.assertEqual(result, 'cat /work/upstream/output/data.csv') + + def test_input_by_name(self): + """The {{input:taskname}} token is replaced with the named task's output directory.""" + executor = StandaloneExecutor(work_dir='/tmp/unused') + tokens = {'input:task1': '/work/task1/output'} + result = executor._substitute_tokens('cat {{ input:task1 }}/data.csv', tokens) + self.assertEqual(result, 'cat /work/task1/output/data.csv') + + def test_whitespace_around_tokens(self): + """Whitespace inside {{ token }} braces 
is tolerated during substitution.""" + executor = StandaloneExecutor(work_dir='/tmp/unused') + tokens = {'output': '/out'} + result = executor._substitute_tokens('{{ output }}/file.txt', tokens) + self.assertEqual(result, '/out/file.txt') + + def test_multiple_tokens_in_one_string(self): + """Multiple distinct tokens in the same string are all replaced.""" + executor = StandaloneExecutor(work_dir='/tmp/unused') + tokens = {'output': '/out', 'input:0': '/in0'} + result = executor._substitute_tokens('cp {{input:0}}/src {{output}}/dst', tokens) + self.assertEqual(result, 'cp /in0/src /out/dst') + + def test_no_tokens_unchanged(self): + """Text without any token placeholders passes through unchanged.""" + executor = StandaloneExecutor(work_dir='/tmp/unused') + result = executor._substitute_tokens('plain text no tokens', {}) + self.assertEqual(result, 'plain text no tokens') + + +class TestBuildTokenMap(unittest.TestCase): + """Verify that token maps are built correctly from task DAG relationships.""" + + def test_output_only(self): + """A task with no inputs produces a token map containing only the output key.""" + spec_text = textwrap.dedent('''\ + workflow: + name: simple + tasks: + - name: task1 + image: alpine:3.18 + command: ["echo"] + ''') + executor = StandaloneExecutor(work_dir='/tmp/work') + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + + node = executor._task_nodes['task1'] + tokens = executor._build_token_map(node) + self.assertEqual(tokens['output'], f'{CONTAINER_DATA_PATH}/output') + self.assertEqual(len(tokens), 1) + + def test_with_upstream_inputs(self): + """A task with upstream inputs gets both index-based and name-based input tokens pointing to container paths.""" + spec_text = textwrap.dedent('''\ + workflow: + name: serial + tasks: + - name: producer + image: alpine:3.18 + command: ["echo"] + - name: consumer + image: alpine:3.18 + command: ["echo"] + inputs: + - task: producer + ''') + executor = 
StandaloneExecutor(work_dir='/tmp/work') + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + + executor._results['producer'] = TaskResult( + name='producer', exit_code=0, output_dir='/tmp/work/producer/output') + + node = executor._task_nodes['consumer'] + tokens = executor._build_token_map(node) + + self.assertEqual(tokens['output'], f'{CONTAINER_DATA_PATH}/output') + self.assertEqual(tokens['input:0'], f'{CONTAINER_DATA_PATH}/input/0') + self.assertEqual(tokens['input:producer'], f'{CONTAINER_DATA_PATH}/input/0') + + +class TestValidateForStandalone(unittest.TestCase): + """Verify that unsupported features are detected and rejected.""" + + def _make_executor(self) -> StandaloneExecutor: + """Create a StandaloneExecutor with a throwaway work directory for validation-only tests.""" + return StandaloneExecutor(work_dir='/tmp/unused') + + def test_simple_spec_passes(self): + """A spec using only task-to-task inputs passes standalone validation.""" + spec_text = textwrap.dedent('''\ + workflow: + name: ok + tasks: + - name: task + image: alpine:3.18 + command: ["echo", "ok"] + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + executor._validate_for_standalone(spec) + + def test_dataset_input_rejected(self): + """A spec with dataset inputs is rejected as unsupported in standalone mode.""" + spec_text = textwrap.dedent('''\ + workflow: + name: bad + tasks: + - name: task + image: ubuntu:24.04 + command: ["echo"] + inputs: + - dataset: + name: my_dataset + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + with self.assertRaises(ValueError) as context: + executor._validate_for_standalone(spec) + self.assertIn('dataset', str(context.exception)) + + def test_url_input_rejected(self): + """A spec with URL inputs is rejected as unsupported in standalone mode.""" + spec_text = textwrap.dedent('''\ + workflow: + name: bad + tasks: + - name: task 
+ image: ubuntu:24.04 + command: ["echo"] + inputs: + - url: s3://my-bucket/data/ + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + with self.assertRaises(ValueError) as context: + executor._validate_for_standalone(spec) + self.assertIn('URL', str(context.exception)) + + def test_dataset_output_ignored(self): + """A spec with dataset outputs passes validation; the output is ignored in standalone mode.""" + spec_text = textwrap.dedent('''\ + workflow: + name: ok + tasks: + - name: task + image: ubuntu:24.04 + command: ["echo"] + outputs: + - dataset: + name: my_dataset + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + executor._validate_for_standalone(spec) + + def test_url_output_rejected(self): + """A spec with URL outputs is rejected as unsupported in standalone mode.""" + spec_text = textwrap.dedent('''\ + workflow: + name: bad + tasks: + - name: task + image: ubuntu:24.04 + command: ["echo"] + outputs: + - url: s3://my-bucket/models/ + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + with self.assertRaises(ValueError) as context: + executor._validate_for_standalone(spec) + self.assertIn('object storage', str(context.exception).lower()) + + def test_multiple_unsupported_features_all_reported(self): + """All unsupported features across multiple tasks are reported in a single error.""" + spec_text = textwrap.dedent('''\ + workflow: + name: bad + tasks: + - name: task1 + image: ubuntu:24.04 + command: ["echo"] + inputs: + - url: s3://bucket/data/ + - name: task2 + image: ubuntu:24.04 + command: ["echo"] + inputs: + - dataset: + name: ds + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + with self.assertRaises(ValueError) as context: + executor._validate_for_standalone(spec) + error_message = str(context.exception) + 
self.assertIn('task1', error_message) + self.assertIn('task2', error_message) + + def test_task_deps_only_passes(self): + """A spec with only task-to-task dependencies passes standalone validation.""" + spec_text = textwrap.dedent('''\ + workflow: + name: ok + tasks: + - name: producer + image: alpine:3.18 + command: ["echo"] + - name: consumer + image: alpine:3.18 + command: ["echo"] + inputs: + - task: producer + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + executor._validate_for_standalone(spec) + + def test_files_and_env_pass(self): + """A spec using files and environment variables passes standalone validation.""" + spec_text = textwrap.dedent('''\ + workflow: + name: ok + tasks: + - name: task + image: alpine:3.18 + command: ["sh", "/tmp/run.sh"] + environment: + MY_VAR: hello + files: + - contents: echo hi + path: /tmp/run.sh + ''') + executor = self._make_executor() + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + executor._validate_for_standalone(spec) + + +class TestValidateForStandaloneRemainingBranches(unittest.TestCase): + """Verify that _validate_for_standalone rejects credentials, checkpoint, volumeMounts, privileged, and hostNetwork.""" + + _UNSUPPORTED_SPECS = { + 'credentials': { + 'yaml': textwrap.dedent('''\ + workflow: + name: bad + tasks: + - name: task + image: ubuntu:24.04 + command: ["echo"] + credentials: + my-secret: NGC_API_KEY + '''), + 'expected_substring': 'credential', + }, + 'checkpoint': { + 'yaml': textwrap.dedent('''\ + workflow: + name: bad + tasks: + - name: task + image: ubuntu:24.04 + command: ["echo"] + checkpoint: + - path: /output/model + url: s3://bucket/checkpoints/ + frequency: 300 + '''), + 'expected_substring': 'checkpoint', + }, + 'volumeMounts': { + 'yaml': textwrap.dedent('''\ + workflow: + name: bad + tasks: + - name: task + image: ubuntu:24.04 + command: ["echo"] + volumeMounts: + - "/data:/data:ro" + '''), + 'expected_substring': 
'volumeMounts', + }, + 'privileged': { + 'yaml': textwrap.dedent('''\ + workflow: + name: bad + tasks: + - name: task + image: ubuntu:24.04 + command: ["echo"] + privileged: true + '''), + 'expected_substring': 'privileged', + }, + 'hostNetwork': { + 'yaml': textwrap.dedent('''\ + workflow: + name: bad + tasks: + - name: task + image: ubuntu:24.04 + command: ["echo"] + hostNetwork: true + '''), + 'expected_substring': 'hostNetwork', + }, + 'host_token_in_args': { + 'yaml': textwrap.dedent('''\ + workflow: + name: bad + groups: + - name: workers + tasks: + - name: leader + lead: true + image: ubuntu:24.04 + command: ["echo"] + args: ["--peer={{host:follower}}"] + - name: follower + image: ubuntu:24.04 + command: ["echo"] + '''), + 'expected_substring': 'host:taskname', + }, + 'host_token_in_env': { + 'yaml': textwrap.dedent('''\ + workflow: + name: bad + groups: + - name: workers + tasks: + - name: leader + lead: true + image: ubuntu:24.04 + command: ["echo"] + environment: + PEER_HOST: "{{ host:follower }}" + - name: follower + image: ubuntu:24.04 + command: ["echo"] + '''), + 'expected_substring': 'host:taskname', + }, + 'host_token_in_files': { + 'yaml': textwrap.dedent('''\ + workflow: + name: bad + groups: + - name: workers + tasks: + - name: leader + lead: true + image: ubuntu:24.04 + command: ["sh", "/tmp/run.sh"] + files: + - contents: | + echo "connecting to {{host:follower}}" + path: /tmp/run.sh + - name: follower + image: ubuntu:24.04 + command: ["echo"] + '''), + 'expected_substring': 'host:taskname', + }, + } + + def test_unsupported_fields_rejected(self): + """Each unsupported task-level field is detected and rejected with a descriptive error.""" + for feature, case in self._UNSUPPORTED_SPECS.items(): + with self.subTest(feature=feature): + executor = StandaloneExecutor(work_dir='/tmp/unused') + spec = executor.load_spec(case['yaml']) + executor._build_dag(spec) + with self.assertRaises(ValueError) as context: + executor._validate_for_standalone(spec) 
+ self.assertIn(case['expected_substring'], str(context.exception)) + + +class TestFilePathTraversal(unittest.TestCase): + """Verify that file paths cannot escape the task directory.""" + + def setUp(self): + """Create a temporary work directory.""" + self.work_dir = tempfile.mkdtemp(prefix='osmo-standalone-traversal-') + + def tearDown(self): + """Remove the temporary work directory.""" + shutil.rmtree(self.work_dir, ignore_errors=True) + + @mock.patch('subprocess.run') + def test_path_traversal_rejected(self, mock_run): + """A file spec with a path that escapes the task directory raises ValueError.""" + mock_run.return_value = mock.Mock(returncode=0) + spec_text = textwrap.dedent('''\ + workflow: + name: traversal + tasks: + - name: task + image: alpine:3.18 + command: ["echo"] + files: + - contents: "malicious" + path: /../../etc/evil.conf + ''') + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + executor._setup_directories() + node = executor._task_nodes['task'] + with self.assertRaises(ValueError) as context: + executor._run_task(node, spec) + self.assertIn('escapes the task directory', str(context.exception)) + + @mock.patch('subprocess.run') + def test_safe_nested_path_accepted(self, mock_run): + """A file spec with a safe nested path is accepted without error.""" + mock_run.return_value = mock.Mock(returncode=0) + spec_text = textwrap.dedent('''\ + workflow: + name: safe + tasks: + - name: task + image: alpine:3.18 + command: ["echo"] + files: + - contents: "safe" + path: /tmp/scripts/run.sh + ''') + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + executor._setup_directories() + node = executor._task_nodes['task'] + executor._run_task(node, spec) + mock_run.assert_called_once() + + +class TestLeadTaskFailurePolicy(unittest.TestCase): + """Verify ignoreNonleadStatus 
behavior: non-lead failures are tolerated when the flag is true.""" + + def setUp(self): + """Create a temporary work directory for lead-task policy tests.""" + self.work_dir = tempfile.mkdtemp(prefix='osmo-standalone-lead-') + + def tearDown(self): + """Remove the temporary work directory after each test.""" + shutil.rmtree(self.work_dir, ignore_errors=True) + + @mock.patch('subprocess.run') + def test_nonlead_failure_ignored_when_flag_true(self, mock_run): + """With ignoreNonleadStatus=true (default), a non-lead failure does not abort the workflow.""" + mock_run.side_effect = [ + mock.Mock(returncode=0), + mock.Mock(returncode=1), + ] + spec_text = textwrap.dedent('''\ + workflow: + name: lead-policy + groups: + - name: workers + tasks: + - name: leader + lead: true + image: alpine:3.18 + command: ["echo", "ok"] + - name: follower + image: alpine:3.18 + command: ["sh", "-c", "exit 1"] + ''') + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + self.assertTrue(executor.execute(spec)) + + @mock.patch('subprocess.run') + def test_lead_failure_aborts_workflow(self, mock_run): + """Even with ignoreNonleadStatus=true, a lead task failure aborts the workflow.""" + mock_run.return_value = mock.Mock(returncode=1) + spec_text = textwrap.dedent('''\ + workflow: + name: lead-policy + groups: + - name: workers + tasks: + - name: leader + lead: true + image: alpine:3.18 + command: ["sh", "-c", "exit 1"] + - name: follower + image: alpine:3.18 + command: ["echo", "ok"] + ''') + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + self.assertFalse(executor.execute(spec)) + + @mock.patch('subprocess.run') + def test_nonlead_failure_aborts_when_flag_false(self, mock_run): + """With ignoreNonleadStatus=false, a non-lead failure aborts the workflow.""" + mock_run.side_effect = [ + mock.Mock(returncode=0), + mock.Mock(returncode=1), + ] + spec_text = 
textwrap.dedent('''\ + workflow: + name: lead-policy + groups: + - name: workers + ignoreNonleadStatus: false + tasks: + - name: leader + lead: true + image: alpine:3.18 + command: ["echo", "ok"] + - name: follower + image: alpine:3.18 + command: ["sh", "-c", "exit 1"] + ''') + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + self.assertFalse(executor.execute(spec)) + + @mock.patch('subprocess.run') + def test_nonlead_failure_does_not_block_downstream_group(self, mock_run): + """A tolerated non-lead failure does not prevent a downstream group from running.""" + mock_run.side_effect = [ + mock.Mock(returncode=0), + mock.Mock(returncode=1), + mock.Mock(returncode=0), + ] + spec_text = textwrap.dedent('''\ + workflow: + name: downstream-after-nonlead-fail + groups: + - name: first + tasks: + - name: leader + lead: true + image: alpine:3.18 + command: ["echo", "ok"] + - name: follower + image: alpine:3.18 + command: ["sh", "-c", "exit 1"] + - name: second + tasks: + - name: consumer + lead: true + image: alpine:3.18 + command: ["echo", "ok"] + inputs: + - task: leader + ''') + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + self.assertTrue(executor.execute(spec)) + self.assertEqual(mock_run.call_count, 3) + + @mock.patch('subprocess.run') + def test_single_task_group_failure_aborts(self, mock_run): + """A single-task group (auto-promoted to lead) aborts on failure like normal.""" + mock_run.return_value = mock.Mock(returncode=1) + spec_text = textwrap.dedent('''\ + workflow: + name: single-fail + tasks: + - name: only-task + image: alpine:3.18 + command: ["sh", "-c", "exit 1"] + ''') + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + self.assertFalse(executor.execute(spec)) + + +class TestUnresolvedTokenDetection(unittest.TestCase): + """Verify that unresolved {{variable}} 
tokens are detected before running containers.""" + + def setUp(self): + """Create a temporary work directory for unresolved token tests.""" + self.work_dir = tempfile.mkdtemp(prefix='osmo-standalone-tokens-') + + def tearDown(self): + """Remove the temporary work directory after each test.""" + shutil.rmtree(self.work_dir, ignore_errors=True) + + def test_jinja_variable_in_args_detected(self): + """A bare {{variable}} in args (without default-values section) is caught before execution.""" + spec_text = textwrap.dedent('''\ + workflow: + name: jinja-leak + tasks: + - name: task + image: "alpine:3.18" + command: ["echo"] + args: ["{{experiment_name}}"] + ''') + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + with self.assertRaises(ValueError) as context: + executor.execute(spec) + self.assertIn('unresolved', str(context.exception).lower()) + self.assertIn('experiment_name', str(context.exception)) + + def test_jinja_variable_in_command_detected(self): + """A bare {{variable}} in command is caught before execution.""" + spec_text = textwrap.dedent('''\ + workflow: + name: jinja-leak + tasks: + - name: task + image: "alpine:3.18" + command: ["{{my_binary}}"] + ''') + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + with self.assertRaises(ValueError) as context: + executor.execute(spec) + self.assertIn('my_binary', str(context.exception)) + + def test_jinja_variable_in_env_detected(self): + """A bare {{variable}} in environment values is caught before execution.""" + spec_text = textwrap.dedent('''\ + workflow: + name: jinja-leak + tasks: + - name: task + image: "alpine:3.18" + command: ["echo"] + environment: + MY_VAR: "{{some_value}}" + ''') + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + with self.assertRaises(ValueError) as context: + executor.execute(spec) + 
self.assertIn('some_value', str(context.exception)) + + def test_jinja_variable_in_file_contents_detected(self): + """A bare {{variable}} in file contents is caught before execution.""" + spec_text = textwrap.dedent('''\ + workflow: + name: jinja-leak + tasks: + - name: task + image: "alpine:3.18" + command: ["sh", "/tmp/run.sh"] + files: + - contents: | + echo {{config_path}}/data + path: /tmp/run.sh + ''') + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + with self.assertRaises(ValueError) as context: + executor.execute(spec) + self.assertIn('config_path', str(context.exception)) + + def test_typo_in_osmo_token_detected(self): + """A typo in an OSMO token (e.g., {{ouptut}}) is caught as unresolved.""" + spec_text = textwrap.dedent('''\ + workflow: + name: typo + tasks: + - name: task + image: "alpine:3.18" + command: ["sh", "-c"] + args: ["echo data > {{ouptut}}/file.txt"] + ''') + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + with self.assertRaises(ValueError) as context: + executor.execute(spec) + self.assertIn('ouptut', str(context.exception)) + + @mock.patch('subprocess.run') + def test_valid_osmo_tokens_not_flagged(self, mock_run): + """Valid OSMO tokens ({{output}}, {{input:0}}) are resolved and not flagged as unresolved.""" + mock_run.return_value = mock.Mock(returncode=0) + spec_text = textwrap.dedent('''\ + workflow: + name: valid + tasks: + - name: producer + image: alpine:3.18 + command: ["sh", "-c"] + args: ["echo ok > {{output}}/data.txt"] + - name: consumer + image: alpine:3.18 + command: ["sh", "-c"] + args: ["cat {{input:0}}/data.txt > {{ output }}/result.txt"] + inputs: + - task: producer + ''') + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + executor.execute(spec) + + def test_error_message_suggests_set(self): + """The unresolved token error 
message suggests using --set to provide values.""" + spec_text = textwrap.dedent('''\ + workflow: + name: helpful + tasks: + - name: task + image: "alpine:3.18" + command: ["echo", "{{missing}}"] + ''') + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + with self.assertRaises(ValueError) as context: + executor.execute(spec) + self.assertIn('--set', str(context.exception)) + + +class TestShmSize(unittest.TestCase): + """Verify that --shm-size is passed to Docker for GPU tasks.""" + + def setUp(self): + """Create a temporary work directory for shm-size tests.""" + self.work_dir = tempfile.mkdtemp(prefix='osmo-standalone-shm-') + + def tearDown(self): + """Remove the temporary work directory after each test.""" + shutil.rmtree(self.work_dir, ignore_errors=True) + + @mock.patch('subprocess.run') + def test_gpu_task_gets_default_shm_size(self, mock_run): + """A GPU task includes --shm-size with the default value when none is specified.""" + mock_run.return_value = mock.Mock(returncode=0, stdout='0\n') + spec_text = textwrap.dedent('''\ + workflow: + name: shm-test + resources: + gpu-resource: + gpu: 1 + tasks: + - name: train + image: pytorch:latest + resource: gpu-resource + command: ["python", "train.py"] + ''') + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + executor._setup_directories() + node = executor._task_nodes['train'] + executor._run_task(node, spec) + + docker_call_args = mock_run.call_args_list[-1][0][0] + self.assertIn('--shm-size', docker_call_args) + shm_index = docker_call_args.index('--shm-size') + self.assertEqual(docker_call_args[shm_index + 1], '16g') + + @mock.patch('subprocess.run') + def test_gpu_task_gets_custom_shm_size(self, mock_run): + """A GPU task uses the user-specified --shm-size value.""" + mock_run.return_value = mock.Mock(returncode=0, stdout='0\n') + spec_text = 
textwrap.dedent('''\ + workflow: + name: shm-test + resources: + gpu-resource: + gpu: 1 + tasks: + - name: train + image: pytorch:latest + resource: gpu-resource + command: ["python", "train.py"] + ''') + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True, shm_size='32g') + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + executor._setup_directories() + node = executor._task_nodes['train'] + executor._run_task(node, spec) + + docker_call_args = mock_run.call_args_list[-1][0][0] + self.assertIn('--shm-size', docker_call_args) + shm_index = docker_call_args.index('--shm-size') + self.assertEqual(docker_call_args[shm_index + 1], '32g') + + @mock.patch('subprocess.run') + def test_non_gpu_task_has_no_default_shm_size(self, mock_run): + """A CPU-only task without explicit shm_size does not include --shm-size.""" + mock_run.return_value = mock.Mock(returncode=0) + spec_text = textwrap.dedent('''\ + workflow: + name: no-gpu + tasks: + - name: preprocess + image: alpine:3.18 + command: ["echo", "ok"] + ''') + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True) + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + executor._setup_directories() + node = executor._task_nodes['preprocess'] + executor._run_task(node, spec) + + docker_call_args = mock_run.call_args[0][0] + self.assertNotIn('--shm-size', docker_call_args) + + @mock.patch('subprocess.run') + def test_non_gpu_task_gets_explicit_shm_size(self, mock_run): + """A CPU-only task gets --shm-size when the user explicitly specifies it.""" + mock_run.return_value = mock.Mock(returncode=0) + spec_text = textwrap.dedent('''\ + workflow: + name: no-gpu + tasks: + - name: preprocess + image: alpine:3.18 + command: ["echo", "ok"] + ''') + executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True, shm_size='8g') + spec = executor.load_spec(spec_text) + executor._build_dag(spec) + executor._setup_directories() + node = 
executor._task_nodes['preprocess'] + executor._run_task(node, spec) + + docker_call_args = mock_run.call_args[0][0] + self.assertIn('--shm-size', docker_call_args) + shm_index = docker_call_args.index('--shm-size') + self.assertEqual(docker_call_args[shm_index + 1], '8g') + + +class TestJinjaTemplateDetection(unittest.TestCase): + """Verify that specs containing Jinja template markers are expanded locally before execution.""" + + def test_jinja_block_expanded(self): + """A spec containing {% %} Jinja block tags is detected and expanded locally.""" + spec_text = textwrap.dedent('''\ + workflow: + name: {% if true %}test{% endif %} + tasks: + - name: task + image: alpine:3.18 + command: ["echo"] + ''') + self.assertTrue(_spec_has_templates(spec_text)) + expanded = _expand_jinja_locally(spec_text) + self.assertNotIn('{%', expanded) + self.assertIn('name: test', expanded) + + def test_jinja_comment_expanded(self): + """A spec containing {# #} Jinja comment tags is detected and stripped locally.""" + spec_text = textwrap.dedent('''\ + {# A comment #} + workflow: + name: test + tasks: + - name: task + image: alpine:3.18 + command: ["echo"] + ''') + self.assertTrue(_spec_has_templates(spec_text)) + expanded = _expand_jinja_locally(spec_text) + self.assertNotIn('{#', expanded) + self.assertIn('name: test', expanded) + + def test_default_values_section_expanded(self): + """A spec containing a 'default-values' section has its variables expanded locally.""" + spec_text = textwrap.dedent('''\ + workflow: + name: "{{experiment_name}}" + tasks: + - name: task + image: alpine:3.18 + command: ["echo"] + default-values: + experiment_name: my-experiment + ''') + self.assertTrue(_spec_has_templates(spec_text)) + expanded = _expand_jinja_locally(spec_text) + self.assertNotIn('{{', expanded) + self.assertIn('my-experiment', expanded) + + +# ============================================================================ +# Tests that exercise error paths without requiring Docker +# 
# ============================================================================
class TestDockerNotFoundHandling(unittest.TestCase):
    """Verify graceful failure when Docker is not available (no Docker required to run)."""

    def setUp(self):
        """Create a temporary work directory."""
        self.work_dir = tempfile.mkdtemp(prefix='osmo-standalone-test-')

    def tearDown(self):
        """Remove the temporary work directory."""
        shutil.rmtree(self.work_dir, ignore_errors=True)

    def test_docker_not_found_graceful_failure(self):
        """Using a non-existent docker binary results in a graceful failure rather than a crash."""
        spec_text = textwrap.dedent('''\
            workflow:
              name: no-docker
              tasks:
              - name: task
                image: alpine:3.18
                command: ["echo", "ok"]
            ''')
        executor = StandaloneExecutor(
            work_dir=self.work_dir,
            keep_work_dir=True,
            docker_cmd='nonexistent-docker-binary-12345',
        )
        spec = executor.load_spec(spec_text)
        self.assertFalse(executor.execute(spec))


class TestCookbookSpecValidation(unittest.TestCase):
    """
    Validate that cookbook specs using unsupported features are rejected
    before any container is started (no Docker required to run).
    """

    COOKBOOK_DIR = os.path.join(os.path.dirname(__file__), '..', '..', '..',
                                'cookbook', 'tutorials')

    def setUp(self):
        """Create a temporary work directory for cookbook validation tests."""
        self.work_dir = tempfile.mkdtemp(prefix='osmo-standalone-cookbook-')

    def tearDown(self):
        """Remove the temporary work directory after each test."""
        shutil.rmtree(self.work_dir, ignore_errors=True)

    def _run_cookbook_spec(self, filename: str) -> bool:
        """Execute a cookbook tutorial spec file through the standalone executor."""
        spec_path = os.path.join(self.COOKBOOK_DIR, filename)
        self.assertTrue(os.path.exists(spec_path),
                        f'Cookbook file not found: {spec_path}')
        return run_workflow_standalone(
            spec_path=spec_path,
            work_dir=self.work_dir,
            keep_work_dir=True,
        )

    def test_unsupported_spec_data_download(self):
        """data_download.yaml uses URL inputs — verify it is cleanly rejected."""
        with self.assertRaises(ValueError) as context:
            self._run_cookbook_spec('data_download.yaml')
        self.assertIn('URL', str(context.exception))

    def test_unsupported_spec_data_upload(self):
        """data_upload.yaml uses URL outputs — verify it is cleanly rejected."""
        with self.assertRaises(ValueError) as context:
            self._run_cookbook_spec('data_upload.yaml')
        self.assertIn('object storage', str(context.exception).lower())

    def test_template_spec_expanded_locally(self):
        """template_hello_world.yaml uses default-values templating — verify it expands locally."""
        spec_path = os.path.join(self.COOKBOOK_DIR, 'template_hello_world.yaml')
        self.assertTrue(os.path.exists(spec_path),
                        f'Cookbook file not found: {spec_path}')
        with open(spec_path, encoding='utf-8') as f:
            spec_text = f.read()
        self.assertTrue(_spec_has_templates(spec_text))
        expanded = _expand_jinja_locally(spec_text)
        self.assertNotIn('{{', expanded)
        self.assertIn('hello-osmo', expanded)
        self.assertIn('Hello from OSMO!', expanded)


class TestRunWorkflowStandaloneErrors(unittest.TestCase):
    """Test error handling in run_workflow_standalone() that does not require Docker."""

    def test_nonexistent_file_raises(self):
        """Passing a non-existent spec file path raises FileNotFoundError."""
        with self.assertRaises(FileNotFoundError):
            run_workflow_standalone(spec_path='/nonexistent/path/spec.yaml')


# ============================================================================
# Integration tests — require Docker; test actual container execution
# ============================================================================
@unittest.skipUnless(DOCKER_AVAILABLE, SKIP_DOCKER_MSG)
class TestDockerExecution(unittest.TestCase):
    """
    Integration tests that run real OSMO workflow specs through the standalone executor
    using Docker. Each test uses a spec that would normally run on a Kubernetes cluster.
    """

    def setUp(self):
        """Create a temporary work directory for each Docker execution test."""
        self.work_dir = tempfile.mkdtemp(prefix='osmo-standalone-test-')

    def tearDown(self):
        """Remove the temporary work directory after each test."""
        shutil.rmtree(self.work_dir, ignore_errors=True)

    def _execute_spec(self, spec_text: str) -> bool:
        """Parse and execute a workflow spec string, returning the success status."""
        executor = StandaloneExecutor(work_dir=self.work_dir, keep_work_dir=True)
        spec = executor.load_spec(spec_text)
        return executor.execute(spec)

    def _read_output(self, task_name: str, filename: str) -> str:
        """Return the text of a file that a task wrote into its output directory.

        The path checked is ``<work_dir>/<task_name>/output/<filename>``. A missing
        file fails the test with the expected path in the message instead of an
        unexplained ``FileNotFoundError``.
        """
        path = os.path.join(self.work_dir, task_name, 'output', filename)
        self.assertTrue(os.path.exists(path), f'Expected output file not found: {path}')
        # Explicit encoding keeps the comparison independent of the host locale.
        with open(path, encoding='utf-8') as f:
            return f.read()

    # ---- Single task tests ----

    def test_hello_world(self):
        """Run a minimal single-task workflow that echoes a message."""
        spec_text = textwrap.dedent('''\
            workflow:
              name: hello-osmo
              tasks:
              - name: hello
                image: alpine:3.18
                command: ["echo", "Hello from OSMO!"]
            ''')
        self.assertTrue(self._execute_spec(spec_text))

    def test_single_task_with_args(self):
        """Run a task with separate command and args fields."""
        spec_text = textwrap.dedent('''\
            workflow:
              name: args-test
              tasks:
              - name: task
                image: alpine:3.18
                command: ["echo"]
                args: ["argument1", "argument2"]
            ''')
        self.assertTrue(self._execute_spec(spec_text))

    def test_task_failure_returns_false(self):
        """A task that exits with a non-zero code causes execute() to return False."""
        spec_text = textwrap.dedent('''\
            workflow:
              name: will-fail
              tasks:
              - name: failing-task
                image: alpine:3.18
                command: ["sh", "-c", "exit 42"]
            ''')
        self.assertFalse(self._execute_spec(spec_text))

    # ---- Environment variable tests ----

    def test_environment_variables(self):
        """Environment variables declared in the spec are passed to the Docker container."""
        spec_text = textwrap.dedent('''\
            workflow:
              name: env-test
              tasks:
              - name: check-env
                image: alpine:3.18
                command: ["sh", "-c"]
                args: ["test \\"$MY_VAR\\" = \\"hello_world\\" && test \\"$SECOND\\" = \\"42\\""]
                environment:
                  MY_VAR: hello_world
                  SECOND: "42"
            ''')
        self.assertTrue(self._execute_spec(spec_text))

    # ---- Files mount tests ----

    def test_inline_file_mounted(self):
        """An inline file declared in the spec is mounted and executable inside the container."""
        spec_text = textwrap.dedent('''\
            workflow:
              name: files-test
              tasks:
              - name: check-file
                image: alpine:3.18
                command: ["sh", "/tmp/run.sh"]
                files:
                - contents: |
                    echo "script ran successfully"
                  path: /tmp/run.sh
            ''')
        self.assertTrue(self._execute_spec(spec_text))

    def test_multiple_files_mounted(self):
        """Multiple inline files at different paths are all mounted into the container."""
        spec_text = textwrap.dedent('''\
            workflow:
              name: multi-files
              tasks:
              - name: check-files
                image: alpine:3.18
                command: ["sh", "-c"]
                args: ["cat /tmp/config.txt && sh /scripts/run.sh"]
                files:
                - contents: "key=value"
                  path: /tmp/config.txt
                - contents: |
                    echo "second script ok"
                  path: /scripts/run.sh
            ''')
        self.assertTrue(self._execute_spec(spec_text))

    # ---- Data output tests ----

    def test_output_directory_writable(self):
        """The {{output}} directory is writable from inside the container and persists on the host."""
        spec_text = textwrap.dedent('''\
            workflow:
              name: output-test
              tasks:
              - name: write-output
                image: alpine:3.18
                command: ["sh", "-c"]
                args: ["echo 'payload' > {{output}}/result.txt"]
            ''')
        self.assertTrue(self._execute_spec(spec_text))
        self.assertEqual(self._read_output('write-output', 'result.txt').strip(), 'payload')

    # ---- Serial data flow tests ----

    def test_serial_data_flow_two_tasks(self):
        """Data written to {{output}} by a producer is readable via {{input:0}} by the consumer."""
        spec_text = textwrap.dedent('''\
            workflow:
              name: serial-data
              tasks:
              - name: producer
                image: alpine:3.18
                command: ["sh", "-c"]
                args: ["echo 'from_producer' > {{output}}/data.txt"]
              - name: consumer
                image: alpine:3.18
                command: ["sh", "-c"]
                args: ["cat {{input:0}}/data.txt > {{output}}/received.txt"]
                inputs:
                - task: producer
            ''')
        self.assertTrue(self._execute_spec(spec_text))
        self.assertEqual(self._read_output('consumer', 'received.txt').strip(), 'from_producer')

    def test_serial_chain_three_tasks(self):
        """Mimics cookbook/tutorials/serial_workflow.yaml"""
        spec_text = textwrap.dedent('''\
            workflow:
              name: serial-chain
              tasks:
              - name: task1
                image: alpine:3.18
                command: ["sh", "-c"]
                args: ["echo 'task1_data' > {{output}}/result.txt"]

              - name: task2
                image: alpine:3.18
                command: ["sh", "-c"]
                args:
                - |
                  cat {{input:0}}/result.txt > {{output}}/result.txt
                  echo '_plus_task2' >> {{output}}/result.txt
                inputs:
                - task: task1

              - name: task3
                image: alpine:3.18
                command: ["sh", "-c"]
                args:
                - |
                  cat {{input:0}}/result.txt > {{output}}/final.txt
                  cat {{input:1}}/result.txt >> {{output}}/final.txt
                inputs:
                - task: task1
                - task: task2
            ''')
        self.assertTrue(self._execute_spec(spec_text))
        content = self._read_output('task3', 'final.txt')
        self.assertIn('task1_data', content)
        self.assertIn('_plus_task2', content)

    # ---- Parallel execution tests ----

    def test_parallel_independent_tasks(self):
        """Independent tasks with no dependencies all execute and produce their respective outputs."""
        spec_text = textwrap.dedent('''\
            workflow:
              name: parallel-tasks
              tasks:
              - name: task-a
                image: alpine:3.18
                command: ["sh", "-c"]
                args: ["echo 'a' > {{output}}/marker.txt"]
              - name: task-b
                image: alpine:3.18
                command: ["sh", "-c"]
                args: ["echo 'b' > {{output}}/marker.txt"]
              - name: task-c
                image: alpine:3.18
                command: ["sh", "-c"]
                args: ["echo 'c' > {{output}}/marker.txt"]
            ''')
        self.assertTrue(self._execute_spec(spec_text))
        for task_name, expected in [('task-a', 'a'), ('task-b', 'b'), ('task-c', 'c')]:
            self.assertEqual(self._read_output(task_name, 'marker.txt').strip(), expected)

    # ---- Diamond DAG tests ----

    def test_diamond_dag(self):
        """A diamond-shaped DAG executes correctly with fan-out and fan-in data flow."""
        spec_text = textwrap.dedent('''\
            workflow:
              name: diamond
              tasks:
              - name: root
                image: alpine:3.18
                command: ["sh", "-c"]
                args: ["echo 'root_data' > {{output}}/base.txt"]
              - name: left
                image: alpine:3.18
                command: ["sh", "-c"]
                args: ["echo 'left:' > {{output}}/result.txt && cat {{input:0}}/base.txt >> {{output}}/result.txt"]
                inputs:
                - task: root
              - name: right
                image: alpine:3.18
                command: ["sh", "-c"]
                args: ["echo 'right:' > {{output}}/result.txt && cat {{input:0}}/base.txt >> {{output}}/result.txt"]
                inputs:
                - task: root
              - name: join
                image: alpine:3.18
                command: ["sh", "-c"]
                args: ["cat {{input:0}}/result.txt > {{output}}/final.txt && cat {{input:1}}/result.txt >> {{output}}/final.txt"]
                inputs:
                - task: left
                - task: right
            ''')
        self.assertTrue(self._execute_spec(spec_text))
        content = self._read_output('join', 'final.txt')
        self.assertIn('left:', content)
        self.assertIn('right:', content)
        self.assertIn('root_data', content)

    # ---- Failure propagation tests ----

    def test_failure_cancels_downstream(self):
        """A failed task prevents its downstream dependent from running."""
        spec_text = textwrap.dedent('''\
            workflow:
              name: fail-chain
              tasks:
              - name: failing
                image: alpine:3.18
                command: ["sh", "-c", "exit 1"]
              - name: should-not-run
                image: alpine:3.18
                command: ["sh", "-c", "echo 'oops' > {{output}}/should_not_exist.txt"]
                inputs:
                - task: failing
            ''')
        self.assertFalse(self._execute_spec(spec_text))
        output_file = os.path.join(
            self.work_dir, 'should-not-run', 'output', 'should_not_exist.txt')
        self.assertFalse(os.path.exists(output_file))

    def test_parallel_failure_does_not_affect_independent_branch(self):
        """When one branch of a parallel DAG fails, the workflow as a whole reports failure.

        Only the overall result is asserted here; whether the sibling ``ok-branch``
        still ran is deliberately not checked.
        """
        spec_text = textwrap.dedent('''\
            workflow:
              name: partial-fail
              tasks:
              - name: root
                image: alpine:3.18
                command: ["sh", "-c"]
                args: ["echo ok > {{output}}/data.txt"]
              - name: fail-branch
                image: alpine:3.18
                command: ["sh", "-c", "exit 1"]
                inputs:
                - task: root
              - name: ok-branch
                image: alpine:3.18
                command: ["sh", "-c"]
                args: ["cat {{input:0}}/data.txt > {{output}}/received.txt"]
                inputs:
                - task: root
            ''')
        self.assertFalse(self._execute_spec(spec_text))

    # ---- Groups (ganged tasks) tests ----

    def test_group_with_single_task(self):
        """A group containing a single lead task executes and produces output."""
        spec_text = textwrap.dedent('''\
            workflow:
              name: single-group
              groups:
              - name: my-group
                tasks:
                - name: leader
                  lead: true
                  image: alpine:3.18
                  command: ["sh", "-c"]
                  args: ["echo 'group_ok' > {{output}}/marker.txt"]
            ''')
        self.assertTrue(self._execute_spec(spec_text))
        self.assertEqual(self._read_output('leader', 'marker.txt').strip(), 'group_ok')

    def test_groups_with_data_flow(self):
        """Mimics cookbook/tutorials/combination_workflow_simple.yaml structure."""
        spec_text = textwrap.dedent('''\
            workflow:
              name: data-pipeline
              groups:
              - name: prepare-data
                tasks:
                - name: generate-dataset
                  lead: true
                  image: alpine:3.18
                  command: ["sh", "-c"]
                  args:
                  - |
                    mkdir -p {{output}}/data
                    for i in 1 2 3; do echo "sample_$i" >> {{output}}/data/dataset.csv; done
              - name: train-models
                tasks:
                - name: train-model
                  lead: true
                  image: alpine:3.18
                  command: ["sh", "-c"]
                  args:
                  - |
                    wc -l {{input:0}}/data/dataset.csv > {{output}}/line_count.txt
                  inputs:
                  - task: generate-dataset
            ''')
        self.assertTrue(self._execute_spec(spec_text))
        self.assertIn('3', self._read_output('train-model', 'line_count.txt'))

    # ---- Input by task name tests ----

    def test_input_by_task_name(self):
        """The {{input:taskname}} token resolves to the named upstream task's output directory."""
        spec_text = textwrap.dedent('''\
            workflow:
              name: named-input
              tasks:
              - name: producer
                image: alpine:3.18
                command: ["sh", "-c"]
                args: ["echo 'named_data' > {{output}}/out.txt"]
              - name: consumer
                image: alpine:3.18
                command: ["sh", "-c"]
                args: ["cat {{input:producer}}/out.txt > {{output}}/received.txt"]
                inputs:
                - task: producer
            ''')
        self.assertTrue(self._execute_spec(spec_text))
        self.assertEqual(self._read_output('consumer', 'received.txt').strip(), 'named_data')

    # ---- Files with token substitution ----

    def test_file_contents_with_token_substitution(self):
        """Mimics cookbook/tutorials/serial_workflow.yaml pattern of inline scripts with tokens."""
        spec_text = textwrap.dedent('''\
            workflow:
              name: file-tokens
              tasks:
              - name: writer
                image: alpine:3.18
                command: ["sh", "/tmp/run.sh"]
                files:
                - contents: |
                    echo "writing output"
                    echo "file_data" > {{output}}/result.txt
                  path: /tmp/run.sh
              - name: reader
                image: alpine:3.18
                command: ["sh", "/tmp/run.sh"]
                files:
                - contents: |
                    cat {{input:0}}/result.txt > {{output}}/received.txt
                  path: /tmp/run.sh
                inputs:
                - task: writer
            ''')
        self.assertTrue(self._execute_spec(spec_text))
        self.assertEqual(self._read_output('reader', 'received.txt').strip(), 'file_data')

    # ---- Resource spec ignored gracefully ----

    def test_resources_ignored_gracefully(self):
        """Resource specs are K8s-specific; standalone executor should accept and ignore them."""
        spec_text = textwrap.dedent('''\
            workflow:
              name: with-resources
              resources:
                default:
                  cpu: 2
                  memory: 4Gi
                  storage: 10Gi
              tasks:
              - name: task
                image: alpine:3.18
                command: ["echo", "ok"]
            ''')
        self.assertTrue(self._execute_spec(spec_text))

    # ---- Alternative container runtime ----

    def test_custom_docker_command(self):
        """An explicitly specified docker command is used to run the container."""
        spec_text = textwrap.dedent('''\
            workflow:
              name: custom-cmd
              tasks:
              - name: task
                image: alpine:3.18
                command: ["echo", "ok"]
            ''')
        executor = StandaloneExecutor(
            work_dir=self.work_dir,
            keep_work_dir=True,
            docker_cmd='docker',
        )
        spec = executor.load_spec(spec_text)
        self.assertTrue(executor.execute(spec))


# ============================================================================
# Integration tests using actual cookbook spec files from the repo
# ============================================================================
@unittest.skipUnless(DOCKER_AVAILABLE, SKIP_DOCKER_MSG)
class TestCookbookSpecs(unittest.TestCase):
    """
    Run real OSMO cookbook YAML specs that are designed for Kubernetes clusters,
    and verify they execute successfully in the standalone Docker executor.
    """

    COOKBOOK_DIR = os.path.join(os.path.dirname(__file__), '..', '..', '..',
                                'cookbook', 'tutorials')

    def setUp(self):
        """Create a temporary work directory for cookbook spec tests."""
        self.work_dir = tempfile.mkdtemp(prefix='osmo-standalone-cookbook-')

    def tearDown(self):
        """Remove the temporary work directory after each cookbook test."""
        shutil.rmtree(self.work_dir, ignore_errors=True)

    def _run_cookbook_spec(self, filename: str) -> bool:
        """Execute a cookbook tutorial spec file through the standalone executor."""
        spec_path = os.path.join(self.COOKBOOK_DIR, filename)
        self.assertTrue(os.path.exists(spec_path),
                        f'Cookbook file not found: {spec_path}')
        return run_workflow_standalone(
            spec_path=spec_path,
            work_dir=self.work_dir,
            keep_work_dir=True,
        )

    def test_hello_world_yaml(self):
        """Execute the hello_world.yaml cookbook tutorial spec."""
        self.assertTrue(self._run_cookbook_spec('hello_world.yaml'))

    def test_parallel_tasks_yaml(self):
        """Execute the parallel_tasks.yaml cookbook tutorial spec."""
        self.assertTrue(self._run_cookbook_spec('parallel_tasks.yaml'))

    def test_serial_workflow_yaml(self):
        """Execute the serial_workflow.yaml cookbook tutorial spec."""
        self.assertTrue(self._run_cookbook_spec('serial_workflow.yaml'))

    def test_resources_basic_yaml(self):
        """Execute the resources_basic.yaml cookbook tutorial spec."""
        self.assertTrue(self._run_cookbook_spec('resources_basic.yaml'))

    def test_combination_workflow_simple_yaml(self):
        """
        The combination_workflow_simple.yaml has a 'sleep 120' in transform-a.
        We skip it here because it would take 2+ minutes; a trimmed version
        of the same structure is tested in TestDockerExecution.test_groups_with_data_flow.
        """
        self.skipTest('Contains sleep 120; covered by test_groups_with_data_flow')


# ============================================================================
# run_workflow_standalone() integration tests
# ============================================================================
@unittest.skipUnless(DOCKER_AVAILABLE, SKIP_DOCKER_MSG)
class TestRunWorkflowStandalone(unittest.TestCase):
    """Test the top-level run_workflow_standalone() convenience function."""

    def setUp(self):
        """Create a temporary work directory for run_workflow_standalone tests."""
        self.work_dir = tempfile.mkdtemp(prefix='osmo-standalone-func-')

    def tearDown(self):
        """Remove the temporary work directory after each test."""
        shutil.rmtree(self.work_dir, ignore_errors=True)

    def _write_spec(self, spec_text: str) -> str:
        """Write a workflow spec to a temporary ``.yaml`` file and return its path.

        The file is created with ``delete=False`` so it still exists after the
        handle is closed, and its removal is registered with ``addCleanup`` so it
        is deleted whether the test passes or fails.
        """
        # Explicit encoding keeps the spec bytes independent of the host locale.
        with tempfile.NamedTemporaryFile(
                mode='w', suffix='.yaml', delete=False, encoding='utf-8') as f:
            f.write(spec_text)
            spec_path = f.name
        self.addCleanup(os.unlink, spec_path)
        return spec_path

    def test_caller_supplied_work_dir_preserved_on_success(self):
        """A caller-supplied work_dir is never deleted, even with keep_work_dir=False."""
        work_dir = tempfile.mkdtemp(prefix='osmo-standalone-cleanup-')
        # This test owns its directory, so it also owns the cleanup.
        self.addCleanup(shutil.rmtree, work_dir, ignore_errors=True)
        spec_path = self._write_spec(textwrap.dedent('''\
            workflow:
              name: cleanup-test
              tasks:
              - name: task
                image: alpine:3.18
                command: ["echo", "ok"]
            '''))
        result = run_workflow_standalone(
            spec_path=spec_path,
            work_dir=work_dir,
            keep_work_dir=False,
        )
        self.assertTrue(result)
        self.assertTrue(os.path.exists(work_dir))

    def test_failure_preserves_work_dir(self):
        """On failure, the work directory is preserved for debugging regardless of the keep flag."""
        spec_path = self._write_spec(textwrap.dedent('''\
            workflow:
              name: fail-test
              tasks:
              - name: task
                image: alpine:3.18
                command: ["sh", "-c", "exit 1"]
            '''))
        result = run_workflow_standalone(
            spec_path=spec_path,
            work_dir=self.work_dir,
            keep_work_dir=False,
        )
        self.assertFalse(result)
        self.assertTrue(os.path.exists(self.work_dir))

    def test_keep_flag_preserves_on_success(self):
        """With keep_work_dir=True, the work directory is preserved even on success."""
        spec_path = self._write_spec(textwrap.dedent('''\
            workflow:
              name: keep-test
              tasks:
              - name: task
                image: alpine:3.18
                command: ["echo", "ok"]
            '''))
        result = run_workflow_standalone(
            spec_path=spec_path,
            work_dir=self.work_dir,
            keep_work_dir=True,
        )
        self.assertTrue(result)
        self.assertTrue(os.path.exists(self.work_dir))


if __name__ == '__main__':
    unittest.main()