From 0255dd31f49b550a777f58daa189972633277516 Mon Sep 17 00:00:00 2001 From: Aleksandr Dremov Date: Thu, 18 Dec 2025 12:22:58 +0100 Subject: [PATCH 1/2] Make distributed mode automatic --- csub.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/csub.py b/csub.py index 5893942..d4575b3 100644 --- a/csub.py +++ b/csub.py @@ -48,7 +48,6 @@ def build_parser() -> argparse.ArgumentParser: parser.add_argument("-i", "--image", type=str, help="Override RUNAI_IMAGE from the env file") parser.add_argument("-p", "--port", type=int, help="Expose a container port") parser.add_argument("--train", action="store_true", help="Submit as a training workload") - parser.add_argument("--distributed", action="store_true", help="Submit a distributed workload") parser.add_argument("--workers", default=0, type=int, help="Only read for distributed workloads. Number of nodes IN ADDITION to the master node. I.e., the total number of nodes is the number of workers + 1 (the master node)") parser.add_argument("--dry", action="store_true", help="Print the generated runai command") parser.add_argument("--env-file", type=str, default=DEFAULT_ENV_FILE, help="Path to the .env file (default: .env in the repo root)") @@ -66,8 +65,10 @@ def build_parser() -> argparse.ArgumentParser: def build_runai_command( args: argparse.Namespace, env: Dict[str, str] ) -> Tuple[List[str], str]: - assert args.train + args.distributed <= 1, "Choose --train or --distributed but not both" - + distributed = args.workers > 0 + if not args.train and distributed: + args.train |= distributed + print("Forcing non-interactive as distributed") job_name = ( args.name or f"{env['LDAP_USERNAME']}-{datetime.now().strftime('%Y%m%d-%H%M%S')}" @@ -130,7 +131,7 @@ def build_runai_command( shell_command += f" && {user_command}" cmd: List[str] = ["runai"] - cmd.extend(["submit-dist", "pytorch"] if args.distributed else ["submit"]) + cmd.extend(["submit-dist", "pytorch"] if distributed else ["submit"]) cmd.extend([ "--name", job_name, @@ -158,7 +159,7 @@ def build_runai_command( if args.memory: cmd.extend(["--memory", args.memory]) - if not args.train and not args.distributed: + if not args.train: cmd.append("--interactive") else: cmd.extend(["--backoff-limit", str(args.backofflimit)]) @@ -173,10 +174,10 @@ def build_runai_command( if args.node_type: cmd.extend(["--node-pools", args.node_type]) - if args.node_type in {"h200", "h100"} and not args.train and not args.distributed: + if args.node_type in {"h200", "h100"} and not args.train: cmd.append("--preemptible") - if args.distributed: + if distributed: cmd.extend([ "--workers", str(args.workers), "--annotation", "k8s.v1.cni.cncf.io/networks=kube-system/roce", From f7553375874a095f5148497b2c7d82fa5ae885a6 Mon Sep 17 00:00:00 2001 From: Aleksandr Dremov Date: Thu, 18 Dec 2025 12:24:58 +0100 Subject: [PATCH 2/2] Update help text for --workers argument --- csub.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csub.py b/csub.py index d4575b3..dc91dfd 100644 --- a/csub.py +++ b/csub.py @@ -48,7 +48,7 @@ def build_parser() -> argparse.ArgumentParser: parser.add_argument("-i", "--image", type=str, help="Override RUNAI_IMAGE from the env file") parser.add_argument("-p", "--port", type=int, help="Expose a container port") parser.add_argument("--train", action="store_true", help="Submit as a training workload") - parser.add_argument("--workers", default=0, type=int, help="Only read for distributed workloads. Number of nodes IN ADDITION to the master node. I.e., the total number of nodes is the number of workers + 1 (the master node)") + parser.add_argument("--workers", default=0, type=int, help="Number of nodes IN ADDITION to the master node. I.e., the total number of nodes is the number of workers + 1 (the master node)") parser.add_argument("--dry", action="store_true", help="Print the generated runai command") parser.add_argument("--env-file", type=str, default=DEFAULT_ENV_FILE, help="Path to the .env file (default: .env in the repo root)") parser.add_argument("--sync-secret-only", action="store_true", help="Create/refresh the Kubernetes secret and exit without submitting a job")