Enable retries for dynamic pipeline function execution

schustmi · schustmi · commit 957fa618ef98 · 2025-12-04T12:08:56.000+08:00
diff --git a/src/zenml/enums.py b/src/zenml/enums.py
@@ -104,6 +104,7 @@ def is_finished(self) -> bool:
             ExecutionStatus.COMPLETED,
             ExecutionStatus.CACHED,
             ExecutionStatus.RETRIED,
+            ExecutionStatus.RETRYING,
             ExecutionStatus.STOPPED,
         }
 
@@ -125,6 +126,20 @@ def is_failed(self) -> bool:
         """
         return self in {ExecutionStatus.FAILED}
 
+    @property
+    def is_in_progress(self) -> bool:
+        """Whether the execution status refers to an in-progress execution.
+
+        Returns:
+            True for initializing, provisioning, running or stopping statuses.
+        """
+        return self in {
+            ExecutionStatus.INITIALIZING,
+            ExecutionStatus.PROVISIONING,
+            ExecutionStatus.RUNNING,
+            ExecutionStatus.STOPPING,
+        }
+
 
 class LoggingLevels(Enum):
     """Enum for logging levels."""
diff --git a/src/zenml/execution/pipeline/dynamic/outputs.py b/src/zenml/execution/pipeline/dynamic/outputs.py
@@ -289,6 +289,19 @@ def result(self) -> List[StepRunOutputs]:
         """
         return [future.result() for future in self.futures]
 
+    def load(self, disable_cache: bool = False) -> List[Any]:
+        """Load the output artifacts of every future in `self.futures`.
+
+        Args:
+            disable_cache: Whether to disable the artifact cache.
+
+        Returns:
+            The loaded outputs, one entry per future in `self.futures`.
+        """
+        return [
+            future.load(disable_cache=disable_cache) for future in self.futures
+        ]
+
     def unpack(self) -> Tuple[List[ArtifactFuture], ...]:
         """Unpack the map results future.
 
diff --git a/src/zenml/execution/pipeline/dynamic/runner.py b/src/zenml/execution/pipeline/dynamic/runner.py
@@ -50,7 +50,10 @@
 from zenml.execution.pipeline.dynamic.run_context import (
     DynamicPipelineRunContext,
 )
-from zenml.execution.pipeline.dynamic.utils import _Unmapped
+from zenml.execution.pipeline.dynamic.utils import (
+    _Unmapped,
+    wait_for_step_run_to_finish,
+)
 from zenml.execution.step.utils import launch_step
 from zenml.logger import get_logger
 from zenml.logging.step_logging import setup_pipeline_logging
@@ -59,9 +62,11 @@
     PipelineRunResponse,
     PipelineRunUpdate,
     PipelineSnapshotResponse,
+    StepRunResponse,
 )
 from zenml.orchestrators.publish_utils import (
     publish_failed_pipeline_run,
+    publish_failed_step_run,
     publish_successful_pipeline_run,
 )
 from zenml.pipelines.dynamic.pipeline_definition import DynamicPipeline
@@ -118,11 +123,20 @@ def __init__(
         self._executor = ThreadPoolExecutor(max_workers=10)
         self._pipeline: Optional["DynamicPipeline"] = None
         self._orchestrator = Stack.from_model(snapshot.stack).orchestrator
-        self._orchestrator_run_id = (
-            self._orchestrator.get_orchestrator_run_id()
-        )
         self._futures: List[StepRunOutputsFuture] = []
 
+        self._existing_step_runs: Dict[str, "StepRunResponse"] = {}
+        if run and run.orchestrator_run_id:
+            logger.info("Continuing existing run `%s`.", str(run.id))
+            self._orchestrator_run_id = run.orchestrator_run_id
+
+            if run.status.is_in_progress:
+                self._existing_step_runs = run.steps.copy()
+        else:
+            self._orchestrator_run_id = (
+                self._orchestrator.get_orchestrator_run_id()
+            )
+
     @property
     def pipeline(self) -> "DynamicPipeline":
         """The pipeline that the runner is executing.
@@ -153,17 +167,28 @@ def pipeline(self) -> "DynamicPipeline":
 
     def run_pipeline(self) -> None:
         """Run the pipeline."""
+        existing_logs_response = None
+        if self._run:
+            for log_response in self._run.log_collection or []:
+                if log_response.source == "orchestrator":
+                    existing_logs_response = log_response
+                    break
+
         with setup_pipeline_logging(
             source="orchestrator",
             snapshot=self._snapshot,
+            logs_response=existing_logs_response,
         ) as logs_request:
             if self._run:
+                run_update = PipelineRunUpdate(
+                    add_logs=[logs_request] if logs_request else None,
+                )
+                if not self._run.orchestrator_run_id:
+                    run_update.orchestrator_run_id = self._orchestrator_run_id
+
                 run = Client().zen_store.update_run(
                     run_id=self._run.id,
-                    run_update=PipelineRunUpdate(
-                        orchestrator_run_id=self._orchestrator_run_id,
-                        add_logs=[logs_request] if logs_request else None,
-                    ),
+                    run_update=run_update,
                 )
             else:
                 run = create_placeholder_run(
@@ -205,6 +230,7 @@ def run_pipeline(self) -> None:
                         self._executor.shutdown(wait=True, cancel_futures=True)
 
                     publish_successful_pipeline_run(run.id)
+                    logger.info("Pipeline completed successfully.")
 
     @overload
     def launch_step(
@@ -260,21 +286,114 @@ def launch_step(
             after=after,
         )
 
-        def _launch() -> StepRunOutputs:
+        should_retry = _should_retry_locally(
+            compiled_step,
+            self._snapshot.pipeline_configuration.docker_settings,
+        )
+
+        def _run_step(
+            remaining_retries: Optional[int] = None,
+        ) -> StepRunOutputs:
+            # TODO: maybe pass run here to avoid extra server requests?
             step_run = launch_step(
                 snapshot=self._snapshot,
                 step=compiled_step,
                 orchestrator_run_id=self._orchestrator_run_id,
-                retry=_should_retry_locally(
-                    compiled_step,
-                    self._snapshot.pipeline_configuration.docker_settings,
-                ),
+                retry=should_retry,
+                remaining_retries=remaining_retries,
             )
             return _load_step_run_outputs(step_run.id)
 
+        existing_step_run = self._existing_step_runs.get(
+            compiled_step.spec.invocation_id
+        )
+        if existing_step_run:
+            if existing_step_run.config != compiled_step.config:
+                logger.warning(
+                    "Configuration for step `%s` changed since the "
+                    "orchestration environment was restarted. If the step "
+                    "needs to be retried, it will use the new configuration.",
+                    existing_step_run.name,
+                )
+
+            def _workload() -> StepRunOutputs:
+                nonlocal existing_step_run
+                assert existing_step_run
+                
+                if existing_step_run.status.is_successful:
+                    return _load_step_run_outputs(existing_step_run.id)
+
+                runtime = get_step_runtime(
+                    step_config=compiled_step.config,
+                    pipeline_docker_settings=self._snapshot.pipeline_configuration.docker_settings,
+                )
+                if (
+                    runtime == StepRuntime.INLINE
+                    and existing_step_run.status.is_in_progress
+                ):
+                    # Inline steps that are in running state didn't have the
+                    # chance to report their failure back to ZenML before the
+                    # orchestration environment was shut down. But there is no
+                    # way that they're actually still running if we're in a new
+                    # orchestration environment, so we mark them as failed and
+                    # potentially restart them depending on the retry config.
+                    existing_step_run = publish_failed_step_run(
+                        existing_step_run.id
+                    )
+
+                remaining_retries = 0
+
+                if should_retry:
+                    max_retries = (
+                        compiled_step.config.retry.max_retries
+                        if compiled_step.config.retry
+                        else 0
+                    )
+                    remaining_retries = max(
+                        0, 1 + max_retries - existing_step_run.version
+                    )
+
+                if existing_step_run.status.is_in_progress:
+                    logger.info(
+                        "Restarting the monitoring of existing step `%s` "
+                        "(step run ID: %s). Remaining retries: %d",
+                        existing_step_run.name,
+                        existing_step_run.id,
+                        remaining_retries,
+                    )
+
+                if remaining_retries > 0:
+                    step_run = wait_for_step_run_to_finish(
+                        existing_step_run.id
+                    )
+                    if not step_run.status.is_successful:
+                        logger.error(
+                            "Failed to run step `%s`.",
+                            existing_step_run.name,
+                        )
+                        return _run_step(remaining_retries=remaining_retries)
+                    else:
+                        return _load_step_run_outputs(existing_step_run.id)
+                else:
+                    step_run = wait_for_step_run_to_finish(
+                        existing_step_run.id
+                    )
+                    if not step_run.status.is_successful:
+                        # This is the last retry, in which case we have to raise
+                        # an error that the step failed.
+                        # TODO: Make this better by raising the actual exception
+                        # that caused the step to fail instead of just a generic
+                        # runtime error.
+                        raise RuntimeError(
+                            f"Failed to run step `{existing_step_run.name}`."
+                        )
+                    return _load_step_run_outputs(existing_step_run.id)
+        else:
+            _workload = _run_step
+
         if concurrent:
             ctx = contextvars.copy_context()
-            future = self._executor.submit(ctx.run, _launch)
+            future = self._executor.submit(ctx.run, _workload)
             step_run_future = StepRunOutputsFuture(
                 wrapped=future,
                 invocation_id=compiled_step.spec.invocation_id,
@@ -283,7 +402,7 @@ def _launch() -> StepRunOutputs:
             self._futures.append(step_run_future)
             return step_run_future
         else:
-            return _launch()
+            return _workload()
 
     def map(
         self,
@@ -337,12 +456,7 @@ def compile_dynamic_step_invocation(
     pipeline: "DynamicPipeline",
     step: "BaseStep",
     inputs: Dict[str, Any],
-    after: Union[
-        "StepRunFuture",
-        "ArtifactFuture",
-        Sequence[Union["StepRunFuture", "ArtifactFuture"]],
-        None,
-    ] = None,
+    after: Union["StepRunFuture", Sequence["StepRunFuture"], None] = None,
     id: Optional[str] = None,
 ) -> "Step":
     """Compile a dynamic step invocation.
diff --git a/src/zenml/execution/pipeline/dynamic/utils.py b/src/zenml/execution/pipeline/dynamic/utils.py
@@ -13,10 +13,19 @@
 #  permissions and limitations under the License.
 """Dynamic pipeline execution utilities."""
 
+import time
 from typing import (
     Generic,
     TypeVar,
 )
+from uuid import UUID
+
+from zenml.client import Client
+from zenml.logger import get_logger
+from zenml.models import StepRunResponse
+
+logger = get_logger(__name__)
+
 
 T = TypeVar("T")
 
@@ -46,3 +55,31 @@ def unmapped(value: T) -> _Unmapped[T]:
         The wrapped value.
     """
     return _Unmapped(value)
+
+
+def wait_for_step_run_to_finish(step_run_id: UUID) -> "StepRunResponse":
+    """Poll the store until a step run has a finished status.
+
+    Args:
+        step_run_id: The ID of the step run.
+
+    Returns:
+        The step run as fetched once its status is finished.
+    """
+    sleep_interval = 1  # seconds between polls; doubled after each poll
+    max_sleep_interval = 64  # doubling stops once this value is reached
+
+    while True:
+        step_run = Client().zen_store.get_run_step(step_run_id)
+
+        if step_run.status.is_finished:
+            return step_run
+
+        logger.debug(
+            "Waiting for step run with ID %s to finish (current status: %s)",
+            step_run_id,
+            step_run.status,
+        )
+        time.sleep(sleep_interval)
+        if sleep_interval < max_sleep_interval:
+            sleep_interval *= 2
diff --git a/src/zenml/execution/step/utils.py b/src/zenml/execution/step/utils.py
@@ -16,6 +16,7 @@
 import time
 from typing import (
     TYPE_CHECKING,
+    Optional,
 )
 
 from zenml.config.step_configurations import Step
@@ -39,6 +40,7 @@ def launch_step(
     step: "Step",
     orchestrator_run_id: str,
     retry: bool = False,
+    remaining_retries: Optional[int] = None,
 ) -> StepRunResponse:
     """Launch a step.
 
@@ -47,6 +49,8 @@ def launch_step(
         step: The step to run.
         orchestrator_run_id: The orchestrator run ID.
         retry: Whether to retry the step if it fails.
+        remaining_retries: The number of remaining retries. If not passed, this
+            will be read from the step configuration.
 
     Raises:
         RunStoppedException: If the run was stopped.
@@ -69,7 +73,10 @@ def _launch_without_retry() -> StepRunResponse:
     else:
         retries = 0
         retry_config = step.config.retry
-        max_retries = retry_config.max_retries if retry_config else 0
+        if remaining_retries is None:
+            max_retries = retry_config.max_retries if retry_config else 0
+        else:
+            max_retries = remaining_retries
         delay = retry_config.delay if retry_config else 0
         backoff = retry_config.backoff if retry_config else 1
 
diff --git a/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator.py b/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator.py
@@ -775,12 +775,7 @@ def _submit_orchestrator_job(
                 annotations=annotations,
                 settings=settings,
                 pod_settings=orchestrator_pod_settings,
-                # In dynamic pipelines restarting the orchestrator pod is not
-                # supported yet. It will create new runs for each restart which
-                # we have to avoid.
-                backoff_limit=0
-                if snapshot.is_dynamic
-                else settings.orchestrator_job_backoff_limit,
+                backoff_limit=settings.orchestrator_job_backoff_limit,
             )
 
             if snapshot.schedule: