
Commit 03bb57e

Update KubeflowExecutor to use CommandTrainer
Refactor the KubeflowExecutor class to replace CustomTrainer with CommandTrainer for improved task handling. Introduce a new enable_tcpxo flag that adds a GPUDirect-TCPXO (tcpxo-daemon) sidecar and its related mounts and environment to the runtime template. The implementation now validates entrypoints and handles task configuration more robustly, ensuring compatibility with CommandTrainer.

- Added enable_tcpxo flag to runtime template
- Updated TrainerClient initialization with KubernetesBackendConfig
- Enhanced error handling for unsupported tasks
- Improved logging for trainer configurations and commands

Signed-off-by: Krishnaswamy Subramanian <subramk@thoughtworks.com>
1 parent dfff05e commit 03bb57e
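
For orientation, a minimal usage sketch of the new flag. The constructor values below are hypothetical placeholders; only the fields touched by this commit (nodes, cpu_limit, memory_limit, gpus, namespace, enable_tcpxo) are taken from the diff.

from nemo_run.core.execution.kubeflow import KubeflowExecutor

# enable_tcpxo=True makes _create_cluster_training_runtime render the tcpxo-daemon
# sidecar, the GKE networking annotations, and the extra host mounts into the
# ClusterTrainingRuntime template.
executor = KubeflowExecutor(
    nodes=2,
    gpus=8,
    cpu_limit="32",
    memory_limit="256Gi",
    namespace="training",
    enable_tcpxo=True,
)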

File tree

4 files changed: +174 -68 lines


nemo_run/core/execution/kubeflow.py

Lines changed: 66 additions & 36 deletions
@@ -20,7 +20,8 @@
 from typing import Any, Dict, Optional, Union
 
 import yaml
-from kubeflow.trainer import CustomTrainer, TrainerClient
+from kubeflow.trainer import CommandTrainer, TrainerClient
+from kubeflow.trainer.backends.kubernetes.types import KubernetesBackendConfig
 from kubernetes import client, config
 from kubernetes.client.exceptions import ApiException
 
@@ -46,7 +47,7 @@ class KubeflowExecutor(Executor):
 
     Example:
 
-    .. code-block:: python
+    . code-block:: python
 
        # Configure executor for execution environment
        executor = KubeflowExecutor(
@@ -104,6 +105,9 @@ class KubeflowExecutor(Executor):
     #: Detach mode flag (set by experiment framework)
     _detach_mode: bool = field(init=False, default=False)
 
+    #: Enable tcpxo sidecar and related mounts/env in runtime template
+    enable_tcpxo: bool = False
+
     def __post_init__(self):
         """Validate executor configuration and setup Kubernetes access."""
         if self.nodes < 1:
@@ -213,7 +217,8 @@ def _get_trainer_client(self) -> TrainerClient:
         """Get or create a TrainerClient instance."""
         if self._trainer_client is None:
             # Initialize client with the executor's namespace
-            self._trainer_client = TrainerClient(namespace=self.namespace)
+            k8s_backend_config = KubernetesBackendConfig(namespace=self.namespace)
+            self._trainer_client = TrainerClient(backend_config=k8s_backend_config)
         return self._trainer_client
 
     def _create_cluster_training_runtime(self, configmap_name: str, sha: str) -> str:
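
For reference, the new client wiring in isolation, a minimal sketch built from the imports and constructor call shown in this diff (the namespace value is a placeholder):

from kubeflow.trainer import TrainerClient
from kubeflow.trainer.backends.kubernetes.types import KubernetesBackendConfig

# TrainerClient no longer takes the namespace directly; it is wrapped in a backend config.
backend_config = KubernetesBackendConfig(namespace="my-namespace")
trainer_client = TrainerClient(backend_config=backend_config)
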
@@ -234,6 +239,7 @@ def _create_cluster_training_runtime(self, configmap_name: str, sha: str) -> str
             "cpu_limit": self.cpu_limit,
             "memory_limit": self.memory_limit,
             "gpus": self.gpus,
+            "enable_tcpxo": self.enable_tcpxo,
         }
         rendered = fill_template(
             template_name="kubeflow_clustertrainingruntime.yaml.j2",
@@ -326,10 +332,8 @@ def _get_additional_files(self, task) -> dict[str, tuple[str, str]]:
             logger.info("Script task - will stage content in ConfigMap")
 
         elif hasattr(task, "__fn_or_cls__"):
-            # Partial task - will be handled directly by CustomTrainer, no ConfigMap staging needed
-            logger.info(
-                "Partial task - will be passed directly to CustomTrainer, skipping ConfigMap staging"
-            )
+            # Partial support not implemented yet for CommandTrainer path
+            logger.warning("Partial tasks are not yet supported with Kubeflow CommandTrainer.")
 
         return files_to_stage
 
@@ -370,43 +374,51 @@ def cleanup_files(self, task_dir: str, task=None):
         # Use experiment-specific naming for cleanup
         self.packager.cleanup(self._get_experiment_identifier())
 
-    def _get_custom_trainer(self, task) -> CustomTrainer:
-        """Get the CustomTrainer configuration for the training job."""
-        trainer_kwargs: dict = {"num_nodes": self.nodes}
+    def _get_custom_trainer(self, task) -> CommandTrainer:
+        """Build a CommandTrainer for a Script task. Partial is not yet supported."""
+        # Reject Partial until implemented
+        if hasattr(task, "__fn_or_cls__"):
+            raise NotImplementedError(
+                "Partial tasks are not yet supported with Kubeflow CommandTrainer"
+            )
+
         resources_per_node: dict = {}
         if self.cpu_limit is not None:
             resources_per_node["cpu"] = self.cpu_limit
         if self.memory_limit is not None:
             resources_per_node["memory"] = self.memory_limit
         if self.gpus is not None:
             resources_per_node["nvidia.com/gpu"] = str(self.gpus)
-        trainer_kwargs["resources_per_node"] = resources_per_node
 
-        if hasattr(task, "__fn_or_cls__"):
-            trainer_kwargs["func"] = task.__fn_or_cls__
-            if hasattr(task, "__arguments__") and task.__arguments__:
-                trainer_kwargs["func_args"] = task.__arguments__
+        # Determine command/args based on entrypoint
+        entrypoint = getattr(task, "entrypoint", "bash") or "bash"
+        mounted_path = f"{self.volume_mount_path}/{self.training_entry}"
+
+        command: list[str]
+        args: list[str]
+        ep_lower = entrypoint.lower()
+        if "bash" in ep_lower:
+            command = ["/bin/bash"]
+            args = ["-c", mounted_path]
+        elif "python" in ep_lower:
+            command = ["python"]
+            args = [mounted_path]
         else:
-            # Script task - set python_file and check for bash scripts
-            trainer_kwargs["python_file"] = f"{self.volume_mount_path}/{self.training_entry}"
-
-            # Check if this is a bash script and set appropriate command
-            if hasattr(task, "inline") and task.inline:
-                entrypoint = getattr(task, "entrypoint", "bash")
-                if entrypoint and "bash" in entrypoint.lower():
-                    trainer_kwargs["command"] = ["/bin/bash"]
-                    logger.info("Using bash command for script execution")
-            # For Python scripts, let SDK auto-detect based on runtime
-
-        # Debug logging to see what we're passing to CustomTrainer
-        logger.info(f"Creating CustomTrainer with kwargs: {trainer_kwargs}")
-
-        trainer = CustomTrainer(**trainer_kwargs)
+            # Fallback: treat entrypoint as executable to run the staged file
+            command = [entrypoint]
+            args = [mounted_path]
+
+        trainer = CommandTrainer(
+            command=command,
+            args=args,
+            num_nodes=self.nodes,
+            resources_per_node=resources_per_node,
+        )
 
-        # Debug logging to see what CustomTrainer actually received
-        logger.info(f"CustomTrainer created with func: {trainer.func}")
-        logger.info(f"CustomTrainer created with func_args: {trainer.func_args}")
-        logger.info(f"CustomTrainer created with python_file: {trainer.python_file}")
+        logger.info(
+            f"CommandTrainer created with command={trainer.command}, args={trainer.args}, "
+            f"num_nodes={trainer.num_nodes}, resources_per_node={trainer.resources_per_node}"
+        )
 
         return trainer
 
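The entrypoint-to-command mapping above, factored out as a plain function for illustration. The function name, signature, and the example path are hypothetical; the branch logic mirrors the committed code.

def resolve_command(entrypoint, mounted_path):
    # Default to bash when the task does not declare an entrypoint.
    entrypoint = entrypoint or "bash"
    ep_lower = entrypoint.lower()
    if "bash" in ep_lower:
        # Bash scripts are run via `bash -c <staged file>`.
        return ["/bin/bash"], ["-c", mounted_path]
    if "python" in ep_lower:
        # Python scripts are passed to the interpreter directly.
        return ["python"], [mounted_path]
    # Fallback: treat the entrypoint itself as the executable for the staged file.
    return [entrypoint], [mounted_path]

# e.g. resolve_command("bash", "/nemo_run/train.sh") -> (["/bin/bash"], ["-c", "/nemo_run/train.sh"])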

@@ -442,11 +454,15 @@ def delete_trainjob(self, job_name: str):
         except Exception as e:
             logger.error(f"Failed to delete TrainJob: {e}")
 
-    def get_trainjob_logs(self, job_name: str, follow: bool = False) -> dict:
+    def get_trainjob_logs(self, job_name: str, follow: bool = False):
         """Get logs from a TrainJob."""
         try:
             client = self._get_trainer_client()
-            return client.get_job_logs(job_name, follow=follow)
+            logs_iter = client.get_job_logs(job_name, follow=follow)
+            # Some tests mock this as a dict; in real SDK it's an Iterator[str]
+            if isinstance(logs_iter, dict):
+                return logs_iter
+            return logs_iter
         except Exception as e:
             logger.error(f"Failed to get TrainJob logs: {e}")
             return {}
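
A sketch of consuming the updated return value. The job name and the minimal constructor call are placeholders, and a real run needs cluster access; per the comment in the hunk, the real SDK yields strings while some tests mock a dict.

from nemo_run.core.execution.kubeflow import KubeflowExecutor

executor = KubeflowExecutor(nodes=1)  # assumed minimal construction
logs = executor.get_trainjob_logs("my-trainjob", follow=False)
if isinstance(logs, dict):
    # mocked/dict-shaped result
    for name, text in logs.items():
        print(name, text)
else:
    # real SDK: iterator of log lines
    for line in logs:
        print(line)
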
@@ -529,3 +545,17 @@ def _runtime_name(self, sha: str) -> str:
         """Build CRT name from the shared experiment identifier and sha."""
         identifier = self._get_experiment_identifier()
         return sanitize_kubernetes_name(f"nemo-runtime-{identifier}-{sha}")
+
+    def _get_staged_file_path(self, filename: str) -> str:
+        """Return path where a staged file would be mounted inside the container.
+
+        If using ConfigMapPackager, files are mounted under volume_mount_path with
+        experiment-specific prefix. Otherwise, return the filename unchanged.
+        """
+        if (
+            isinstance(self.packager, ConfigMapPackager)
+            and hasattr(self, "experiment_name")
+            and self.experiment_name
+        ):
+            return f"{self.volume_mount_path}/{self.experiment_name}-{filename}"
+        return filename
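
A standalone mirror of the new _get_staged_file_path logic, for illustration only; the real method also checks that the packager is a ConfigMapPackager, and the names and values below are placeholders.

def staged_file_path(volume_mount_path, experiment_name, filename):
    # With an experiment name set, staged files live under the mount path
    # with an experiment-specific prefix; otherwise the filename is returned unchanged.
    if experiment_name:
        return f"{volume_mount_path}/{experiment_name}-{filename}"
    return filename

print(staged_file_path("/nemo_run/scripts", "demo-exp", "train.sh"))
# /nemo_run/scripts/demo-exp-train.sh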

nemo_run/core/execution/templates/kubeflow_clustertrainingruntime.yaml.j2

Lines changed: 91 additions & 1 deletion
@@ -3,6 +3,8 @@ kind: ClusterTrainingRuntime
 metadata:
   name: {{ runtime_name }}
   namespace: {{ namespace }}
+  labels:
+    trainer.kubeflow.org/framework: torch
 spec:
   mlPolicy:
     numNodes: {{ nodes }}
@@ -17,22 +19,110 @@ spec:
       metadata:
         labels:
           trainer.kubeflow.org/trainjob-ancestor-step: trainer
+        {% if enable_tcpxo %}
+        annotations:
+          devices.gke.io/container.tcpxo-daemon: |
+            - path: /dev/nvidia0
+            - path: /dev/nvidia1
+            - path: /dev/nvidia2
+            - path: /dev/nvidia3
+            - path: /dev/nvidia4
+            - path: /dev/nvidia5
+            - path: /dev/nvidia6
+            - path: /dev/nvidia7
+            - path: /dev/nvidiactl
+            - path: /dev/nvidia-uvm
+            - path: /dev/dmabuf_import_helper
+          networking.gke.io/default-interface: eth0
+          networking.gke.io/interfaces: |
+            [
+              {"interfaceName":"eth0","network":"default"},
+              {"interfaceName":"eth1","network":"vpc1"},
+              {"interfaceName":"eth2","network":"vpc2"},
+              {"interfaceName":"eth3","network":"vpc3"},
+              {"interfaceName":"eth4","network":"vpc4"},
+              {"interfaceName":"eth5","network":"vpc5"},
+              {"interfaceName":"eth6","network":"vpc6"},
+              {"interfaceName":"eth7","network":"vpc7"},
+              {"interfaceName":"eth8","network":"vpc8"}
+            ]
+        {% endif %}
       spec:
         template:
           spec:
             volumes:
              - name: workspace
                configMap:
                  name: {{ configmap_name }}
+                 defaultMode: 0755
+             - name: mistral-checkpoint
+               persistentVolumeClaim:
+                 claimName: mistral-checkpoint
+             - name: libraries
+               hostPath:
+                 path: /home/kubernetes/bin/nvidia/lib64
+             - name: sys
+               hostPath:
+                 path: /sys
+             - name: proc-sys
+               hostPath:
+                 path: /proc/sys
+             - name: aperture-devices
+               hostPath:
+                 path: /dev/aperture_devices
+             - name: dshm
+               emptyDir:
+                 medium: Memory
+                 sizeLimit: 2048Gi
            containers:
              - name: node
                image: {{ image }}
+               env:
+                 - name: LD_LIBRARY_PATH
+                   value: /usr/local/nvidia/lib64
+                 - name: NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY
+                   value: /dev/aperture_devices
               volumeMounts:
                 - name: workspace
                   mountPath: {{ volume_mount_path }}
+                - name: mistral-checkpoint
+                  mountPath: /workspace
+                - name: dshm
+                  mountPath: /dev/shm
+                - name: aperture-devices
+                  mountPath: /dev/aperture_devices
               resources:
-                requests: {}
+                requests:
+                  {% if cpu_limit %}cpu: {{ cpu_limit }}{% endif %}
+                  {% if memory_limit %}memory: {{ memory_limit }}{% endif %}
+                  {% if gpus %}"nvidia.com/gpu": {{ gpus }}{% endif %}
                 limits:
                   {% if cpu_limit %}cpu: {{ cpu_limit }}{% endif %}
                   {% if memory_limit %}memory: {{ memory_limit }}{% endif %}
                   {% if gpus %}"nvidia.com/gpu": {{ gpus }}{% endif %}
+            {% if enable_tcpxo %}
+             - name: tcpxo-daemon
+               image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.15
+               imagePullPolicy: Always
+               command: ["/bin/sh", "-c"]
+               args:
+                 - |
+                   set -ex
+                   chmod 755 /fts/entrypoint_rxdm_container.sh
+                   /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid= --alsologtostderr
+               env:
+                 - name: LD_LIBRARY_PATH
+                   value: /usr/local/nvidia/lib64
+               securityContext:
+                 capabilities:
+                   add:
+                     - NET_ADMIN
+                     - NET_BIND_SERVICE
+               volumeMounts:
+                 - name: libraries
+                   mountPath: /usr/local/nvidia
+                 - name: sys
+                   mountPath: /hostsysfs
+                 - name: proc-sys
+                   mountPath: /hostprocsysfs
+            {% endif %}
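
An illustrative rendering of the updated template with enable_tcpxo toggled on. The executor itself goes through fill_template; this standalone snippet uses jinja2 directly, and all values are placeholders. The variable names mirror the template and the template_args dict built in _create_cluster_training_runtime.

from jinja2 import Template

with open("kubeflow_clustertrainingruntime.yaml.j2") as f:
    rendered = Template(f.read()).render(
        runtime_name="nemo-runtime-demo-abc123",
        namespace="training",
        nodes=2,
        image="nvcr.io/nvidia/nemo:latest",
        configmap_name="demo-workspace",
        volume_mount_path="/nemo_run/scripts",
        cpu_limit="32",
        memory_limit="256Gi",
        gpus=8,
        enable_tcpxo=True,  # emits the tcpxo-daemon sidecar, GKE annotations, and host mounts
    )
print(rendered)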

pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -34,7 +34,7 @@ dependencies = [
     "packaging",
     "toml",
     "kubernetes>=28.0.0",
-    "kubeflow @ git+https://github.com/jskswamy/kubeflow-sdk.git#subdirectory=python",
+    "kubeflow @ git+https://github.com/jskswamy/kubeflow-sdk.git@main",
 ]
 readme = "README.md"
 requires-python = ">= 3.10"
@@ -58,7 +58,7 @@ skypilot-all = ["skypilot[all]>=0.10.0"]
 ray = ["kubernetes"]
 kubernetes = [
     "kubernetes>=28.0.0",
-    "kubeflow @ git+https://github.com/jskswamy/kubeflow-sdk.git#subdirectory=python",
+    "kubeflow @ git+https://github.com/jskswamy/kubeflow-sdk.git@main",
 ]
 
 [dependency-groups]
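
A quick import smoke test against the re-pinned fork; these are exactly the symbols the refactor depends on.

# Should succeed once the @main pin of the jskswamy/kubeflow-sdk fork is installed.
from kubeflow.trainer import CommandTrainer, TrainerClient
from kubeflow.trainer.backends.kubernetes.types import KubernetesBackendConfig

print(CommandTrainer.__name__, TrainerClient.__name__, KubernetesBackendConfig.__name__)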
